# Let's avoid messages and warnings in
# SA_Amazon_Insights&Results_Revisited.html.
# Anyway, messages and warnings produced
# by the code have already been dealt with.
# Global knitr configuration for the rendered HTML report
# (SA_Amazon_Insights&Results_Revisited.html):
# - echo = TRUE: code chunks are kept in the output (revealed via buttons);
# - message/warning = FALSE: suppress chunk messages and warnings, which
#   have already been reviewed and dealt with in the code;
# - out.width / fig.align: deploy figures at full width and center them.
# opts_chunk$set() merges options, so a single call is equivalent to the
# two successive calls it replaces.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  out.width = "100%",
  fig.align = "center"
)
# Emit tables in HTML format, which facilitates table layout in HTML output.
options(knitr.table.format = "html")
# The string <br> is used to generate empty lines.
91 % prediction accuracy has been reached in this project predicting sentiment polarity on Amazon reviews. Classification is binary, with positive and negative polarity. The final model is delivered by the algorithm eXtreme Gradient Boosting Tree; CART is used in the training process because it is swift and delivers clear outputs.
Natural Language Processing and Text Mining have contributed to the accuracy performance.
Natural Language Processing was about corpus, lowercasing, punctuation handling, stopword removal, stemming, tokenization, and bag of words.
Text mining has brought insights about subjective information. In the CART trees, some tokens conveying subjective information predominate; however, other tokens containing subjective information were not used in false negatives and false positives. Subjective information has been retrieved exclusively from the training set; customized lists have been established with tokens sorted as having either positive or negative sentiment orientation; occurrences of these tokens in reviews have been replaced with either a positive or a negative generic token. Text Mining has also brought another insight, quantitatively less impactful though: negation and negative short forms (contractions) have been integrated in the process; they have been used among others to automatically flip sentiment polarity of the generic tokens when preceding these generic tokens.
A previous version of this project can be found in https://github.com/Dev-P-L/Sentiment-Analysis--Amazon-Reviews . This version has introduced interactive tables and figures, which have greatly facilitated and extended text mining, leading to higher accuracy.
The previous version had extensively looked for Machine Learning optimization, which had been conducted across ten models on accuracy distributions generated by bootstrap resampling. eXtreme Gradient Boosting Linear had emerged as the best-performing model. This conclusion has been superseded in the current version without further testing, merely replacing XGBoost Linear with XGBoost Tree, further to results reported in the data science literature. XGBoost Tree has very slightly contributed to accuracy, text mining contributing much more this time.
TAGS: sentiment analysis, natural language processing, text mining, subjective information, tokenization, bag of words, word frequency, interactive tables, interactive figures, decision trees, false negatives, false positives, text classification, polarization, lists of positive n-grams, lists of negative n-grams, text substitution, machine learning, binary classification, CART, eXtreme Gradient Boosting Tree, R
GITHUB: https://github.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited
Dear Readers, you are most welcome to run the project on your own computer if you so wish.
This project is lodged with the GitHub repository https://github.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited .
It is comprised of fourteen files. All code is included in SA_Amazon_Code_Revisited.Rmd. It is hidden by default in the result report, called SA_Amazon_Insights&Results_Revisited.html, but can be accessed with buttons.
For your convenience, the dataset has already been downloaded onto the GitHub repository wherefrom it will be automatically retrieved by the code from SA_Amazon_Code_Revisited.Rmd. If you so wish, you can also easily retrieve the dataset from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences and adapt the SA_Amazon_Code_Revisited.Rmd code accordingly.
You can knit SA_Amazon_Code_Revisited.Rmd (please in HTML) and produce SA_Amazon_Insights&Results_Revisited.html on your own computer. Before knitting SA_Amazon_Code_Revisited.Rmd (please in HTML) on your computer, don’t forget to copy the file styles.css from the GitHub repository into the same folder as SA_Amazon_Code_Revisited.Rmd.
Some packages are required in SA_Amazon_Code_Revisited.Rmd. The code from SA_Amazon_Code_Revisited.Rmd contains instructions to download these packages if they are not available yet.
For information about my work environment, see the session info section at the end of this document.
# I. CLEANING USER INTERFACE FOR RAM MANAGEMENT.
# a. Close the active graphics device, if any (clears the plots pane).
if (!is.null(dev.list())) {
  invisible(dev.off())
}
# b. Empty the workspace (deliberate: this is a stand-alone report script).
rm(list = ls())
# c. Clear the console ("\014" is the form-feed character).
cat("\014")
########################################################################
# II. PACKAGES.
# a./b. Install (only if missing) and attach every required package.
# A single vector + loop replaces nineteen copy-pasted
# "if(!require(...)) install.packages(...)" / "library(...)" pairs;
# attachment order is preserved. The CRAN mirror is reached over HTTPS
# (CRAN no longer recommends plain http).
pkgs <- c(
  "tidyverse", "tm", "SnowballC", "e1071", "wordcloud2", "RColorBrewer",
  "caTools", "rpart", "rpart.plot", "caret", "xgboost", "kableExtra",
  "gridExtra", "utf8", "devtools", "plotly", "htmltools", "DT", "utils"
)
for (pkg in pkgs) {
  # requireNamespace() checks availability without attaching;
  # library() then attaches and errors loudly if installation failed.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
  library(pkg, character.only = TRUE)
}
rm(pkg, pkgs)
# c. Preventing wordclouds silently failing
# after the first wordcloud2.
# See https://github.com/Lchiffon/wordcloud2/issues/65 .
# NOTE(review): this reinstalls the GitHub fork on every knit and needs
# network access; consider guarding it once the fix is confirmed installed.
devtools::install_github("gaospecial/wordcloud2")
########################################################################
# III. COLOR PALETTE
# Named hex colors reused throughout the report — e.g. greenish_blue and
# harvard_crimson serve as row backgrounds in the kable/datatable outputs
# below. Defined once here so every table and figure stays consistent.
dark_cerulean <- "#08457E"
dodger_blue <- "#0181ff"
greenish_blue <- "#507786"
light_gray <- "#808080"
super_light_gray <- "#a7a7a7"
harvard_crimson <- "#a41034"
light_taupe <- "#b38b6d"
super_light_taupe <- "#d6c0b0"
dark_paris_green <- "#319b54"
paris_green <- "#50C878"
super_light_paris_green <- "#8adaa5"
# For other hues, preexisting denominations
# will be used such as "powderblue".
Now, let’s turn to data.
There are 1,000 reviews in the data set, from the UCI Machine Learning Repository.
As explained on the UCI Machine Learning Repository website, data is organized in a CSV file in two columns. In the first column, there are 1,000 Amazon product reviews (sentences). In the second column, there is a positive or negative evaluation; the ratio of positive evaluations is 50 %.
That file will be split into training reviews - two thirds of reviews - and validation reviews. Let’s have a quick look at the number of positive and negative reviews in the training set.
# Downloading data.
# Tab-separated file with two columns and no header:
# column 1 = review text, column 2 = 0/1 sentiment label.
# quote = "" prevents apostrophes/quotes inside reviews from being
# interpreted as field delimiters.
myfile <- "https://raw.githubusercontent.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited/master/amazon_cells_labelled.txt"
reviews <- read.delim(myfile, header = FALSE,
sep = "\t", quote = "",
stringsAsFactors = FALSE)
rm(myfile)
reviews <- reviews %>%
`colnames<-`(c("text", "sentiment")) %>%
# Replacing numerical variable "sentiment" (0/1 values)
# with factor variable "sentiment" (Neg/Pos values).
# NOTE: this gsub() recode relies on the column containing only the
# digits 0 and 1, as documented for this dataset.
mutate(sentiment = as.factor(gsub("1", " Pos",
gsub("0", "Neg", sentiment)))) %>%
as.data.frame()
# The leading empty space character in " Pos"
# cares for " Pos" coming first in the confusion matrix
# (factor levels sort alphabetically and " " sorts before "N")
# so that a "true positive" (review that is
# predicted positive and is actually positive)
# corresponds to positive review polarity.
# Creating training index and validation index.
# The seed makes the 2/3-1/3 split reproducible across knits.
# createDataPartition() samples within each level of "sentiment"
# (stratified split), keeping the classes balanced in the training set.
set.seed(1)
ind_train <-
createDataPartition(y = reviews$sentiment,
times = 1, p = 2/3, list = FALSE)
ind_val <-
as.integer(setdiff(1:nrow(reviews), ind_train))
# ind_train allows to select the reviews that will be used
# for training, be it in NLP, in text mining or in ML.
# Building up the training set with the training index.
# "ro" stores the (renumbered) row number as a character column; it is
# shown later as the "Row Number" column of the presentation tables.
reviews_training <- reviews[ind_train, ] %>%
as.data.frame() %>%
`rownames<-`(1:nrow(.)) %>%
mutate(ro = rownames(.)) %>%
select(ro, everything())
# Some simple statistics in a table:
# numbers of positives reviews and of negative ones.
# Row 1 (" Pos") is colored greenish blue, row 2 ("Neg") Harvard crimson,
# matching the palette defined in section III.
tab <- table(reviews_training$sentiment) %>%
as.data.frame() %>%
`colnames<-`(c("Review Polarity",
"Number of Reviews in Training Set"))
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, width = "2.5in", bold = T) %>%
column_spec(2, width = "3in", bold = T) %>%
row_spec(1, color = "white",
background = greenish_blue) %>%
row_spec(2, color = "white",
background = harvard_crimson)| Review Polarity | Number of Reviews in Training Set |
|---|---|
| Pos | 334 |
| Neg | 334 |
So, there are indeed as many reviews with positive sentiment polarity as reviews with negative sentiment polarity.
Let’s have a look at training reviews.
# Building up data frame.
# Renames reviews_training's columns (ro/text/sentiment) for display.
tab <- reviews_training %>%
`colnames<-`(c("Row Number", "Training Review",
"Sentiment"))
# Building up interactive presentation table.
# The JS() snippets below are executed client-side by DataTables.
datatable(tab, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T,
# Setting background color and font color in header.
initComplete = JS(
'function(settings, json) {',
'$(this.api().table().header()).css({
"background-color": "#507786",
"color": "white"});',
'}'),
# Setting background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "powderblue";','}',
'}')
)
)In order to better catch the relationship between the reviews and the reviews sentiment polarity, let’s proceed to some Natural Language Processing. The idea is to detect words that impact sentiment polarity.
We have seen before that 50 % of reviews have positive sentiment polarity; of course, also 50 % of reviews have negative sentiment polarity.
Consequently, we cannot apply the baseline model in prediction. Indeed, considering that all reviews have e.g. positive polarity would deliver 50 % true positives and 50 % false positives, i.e. an accuracy of only 50 %.
We do need additional information to predict. We are going to retrieve that information from words. So, let’s identify words.
To do so, we are going to apply Natural Language Processing to the training reviews.
Through NLP, we will get a bag of words that takes the form of a Document Term Matrix: the 668 rows correspond to the 668 training reviews; there is a column for each token. At the junction of each row and each column, there is a frequency number representing the occurrence of the corresponding token in the corresponding review.
Applying a sparsity threshold of .995 will only leave tokens that appear in at least 0.5 % of reviews.
As a pre-attentive insight, a wordcloud will show the most frequent tokens. The wordcloud is interactive: just hover over a token and you get the frequency of occurrence.
# Corpus is created on training reviews only to avoid
# any interference between training reviews
# and validation reviews. Otherwise, tokens
# from validation set could (slightly) impact
# token selection when applying the sparsity threshold.
corpus <-
VCorpus(VectorSource(reviews_training$text))
# Lowercasing, removing punctuation and stopwords,
# stemming document.
# NOTE: this order is significant — removing punctuation BEFORE stopwords
# is what later turns "don't" into the non-stopword "dont" (analyzed below).
corpus <-
tm_map(corpus, content_transformer(tolower))
corpus <-
tm_map(corpus, removePunctuation)
corpus <-
tm_map(corpus, removeWords, stopwords("english"))
corpus <-
tm_map(corpus, stemDocument)
# Building up a bag of words in a Document Term Matrix.
dtm <- DocumentTermMatrix(corpus)
# Managing sparsity with sparsity threshold.
# 0.995 keeps tokens appearing in at least 0.5 % of reviews.
sparse <- removeSparseTerms(dtm, 0.995)
# Converting sparse, which is a DocumentTermMatrix,
# to a matrix and then to a data frame.
sentSparse <- as.data.frame(as.matrix(sparse))
# Making all column names R-friendly.
colnames(sentSparse) <- make.names(colnames(sentSparse))
# In order to get some pre-emptive insights
# into the bag of words, let's use a wordcloud.
# First, let's build up a data frame with only
# the 40 most frequent tokens from "sentSparse",
# i.e. the Document Term Matrix pruned
# by the sparsity process.
df <- data.frame(word = colnames(sentSparse),
freq = colSums(sentSparse)) %>%
filter(freq >= 10) %>%
arrange(desc(freq)) %>%
head(., 40)
# Second, let's create the wordcloud. Numerous colors
# are used to easily dissociate tokens.
# The seed fixes the (random) color assignment across knits.
set.seed(1)
wordcloud2(df, shape = 'square', color = 'random-light',
backgroundColor = harvard_crimson,
shuffle = FALSE)There are topic-related tokens such as phone, tokens conveying subjective information such as great, etc. Before analyzing token categories, let’s check up the technical adequacy of results from the NLP process.
The wordcloud above is an ergonomic tool to easily pinpoint some NLP flaws.
Some tokens were not expected, such as dont or ive, since they seem to originate in short forms and were expected to have been eliminated as stopwords.
Let’s start investigating with dont. The frequency of occurrence is at least 10 since that is a prerequisite to enter the wordcloud. But there can be more instances.
# In the training reviews, which rows contain a digit
# at least equal to 1 in the column "dont"?
# "bin" (row indices of reviews containing "dont") is reused below
# to inspect the individual reviews.
bin <- which(sentSparse$dont >= 1)
# Building up a small presentation table.
df <- data.frame(length(bin)) %>%
`colnames<-`('Number of Training Reviews Containing "dont"')
# Layout of the table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = "white",
background = harvard_crimson) | Number of Training Reviews Containing “dont” |
|---|
| 20 |
Perusing the bag of words for rows containing dont has led to distinguishing two scenarios. The first one is an exception, but it can be generalized to other tokens. Here it is.
# Building up data frame.
# bin[17] is the hand-identified misspelling case ("dont" written
# without an apostrophe in the original review).
# NOTE(review): this hard-coded index depends on the current data and
# seed — it would need revisiting if either changed.
df <-
data.frame(reviews_training$ro[bin[17]],
reviews_training$text[bin[17]]) %>%
`colnames<-`(c("Training Review Number",
'"dont" Originating in Misspelling'))
# Layout of the table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
row_spec(1, bold = T, color = "white",
background = harvard_crimson) | Training Review Number | “dont” Originating in Misspelling |
|---|---|
| 544 | dont buy it. |
dont contains a spelling error or is, in more inclusive wording, alternative grammar: it has been used instead of don’t. Actually, there is only one occurrence in the bag of words. But it could happen more often and also with other short forms such as couldn’t, isn’t, … becoming couldnt, isnt, …
We are going to treat these misspelled short forms as if they were standardly written. This will be done in the next section Fine Tuning NLP.
Now, let’s have a look at the most common scenario that has generated dont. Let’s just show the single review with two occurrences.
# Localizing the cases that have originated
# in the short form standardly written with
# an apostrophe. These are all cases except
# the one in the first scenario above.
bin_2 <- bin[-17]
# Building up data frame.
tab <- reviews_training[bin_2, ]
tab <-
tab %>%
`colnames<-`(c("Row Number",
"Training Review Containing \"don't\"",
"Sentiment"))
# Building up interactive presentation table.
# Same datatable styling pattern as the previous tables, with the
# Harvard-crimson header and light-taupe rows from the palette.
datatable(tab, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T,
# Setting background color and font color in header.
initComplete = JS(
'function(settings, json) {',
'$(this.api().table().header()).css({
"background-color": "#A41034",
"color": "white"});',
'}'),
# Setting background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)This is the general scenario: don’t has been standardly written and it was expected to have disappeared as all stopwords and nevertheless it is still in the bag of words since we have seen it in the bag of words wordcloud.
What happened? Before stopword removal, all punctuation marks have been removed and consequently don’t has become dont; it is no longer identical to the stopword don’t and, very logically, it has not been removed.
This scenario happened in 19 reviews and, without change, it would happen for all short forms that include an apostrophe.
In order to prevent that scenario from happening, there are simple solutions, e.g.:
An appropriate solution will be applied in the section Fine Tuning NLP.
Now, it is time we switched to another NLP flaw that is perceptible in the bag of words wordcloud above: words collapse.
Let’s have a look at the whole bag of words (obtained before applying the sparsity process).
# Collecting all tokens, upstream of the
# sparsity process, which the token "brokeni",
# which will be commented upon just below,
# couldn't pass since there is only one instance
# of "brokeni"!
# lowfreq = 1 therefore returns EVERY term of the full DTM.
tokens <-
findFreqTerms(dtm, lowfreq = 1) %>%
as.data.frame() %>%
`colnames<-`("Token from the Bag of Words")
# Instead of "findFreqTerms(dtm, lowfreq = 1)"
# we could also have used "colnames(dtm)" ...
# Building up an interactive presentation table.
datatable(tokens, rownames = FALSE, filter = "top",
options = list(width = "450px", pageLength = 10,
scrollX = F,
# Centers the single datatable column (column 0).
columnDefs = list(list(className = 'dt-center',
targets = 0)),
# Sets background color and font color in header.
initComplete = JS(
'function(settings, json) {',
'$(this.api().table().header()).css({
"background-color": "#A41034",
"color": "white"});',
'}'),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)First, there are several numbers. Should they be removed? That question will be dealt with in different ways below.
Second, some unigrams seem to originate from two words:
Let’s check whether e.g. brokeni originates in words collapse.
# We have to work on all tokens, upstream of the sparsity process,
# which the token "brokeni" couldn't pass since there is only one
# instance of "brokeni"! The corpus meets this requirement: it
# contains all tokens. For each document, count the content lines
# matching "brokeni"; the resulting integer vector "v" is indexed
# like reviews_training, and which(v >= 1) below extracts the review.
# vapply() + seq_along() replace the original index loop, which
# overwrote its own 1:length(corpus) index vector element by element.
v <- vapply(
  seq_along(corpus),
  function(i) length(grep("brokeni", corpus[[i]]$content)),
  integer(1)
)
# Second, retrieving the corresponding review
# and inserting it into a data frame.
# which(v >= 1) gives the row index(es) where "brokeni" occurred.
df <-
data.frame(
reviews_training$ro[which(v >= 1)],
reviews_training$text[which(v >= 1)],
stringsAsFactors = FALSE) %>%
`colnames<-`(c("Row Number",
'Training Review Producing "brokeni"'))
# Layout of the table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, width = "2in") %>%
row_spec(1, bold = T, color = "white",
background = harvard_crimson) | Row Number | Training Review Producing “brokeni” |
|---|---|
| 381 | I got the car charger and not even after a week the charger was broken…I went to plug it in and it started smoking. |
What happened? Well, broken…I was first lowercased to broken…i, then punctuation was removed by the function removePunctuation(), which does not insert any empty space character, and broken…i has become brokeni.
This has to be corrected of course for brokeni but also for similar cases. In the next section Fine Tuning NLP, a general solution will be applied.
Instead of using the function removePunctuation() from the package tm, specific “for loops” will be developed, preprocessing reviews according to the needs stated above and in a stepwise way:
Among stopwords, short forms (contractions) need to be specifically treated. Additional needs of breakdown might also emerge. Starting from the stopword list delivered by the function stopwords(“english”) from the package tm, four CSV files will be produced.
These are the four files:
The 4 files have been uploaded to the GitHub repository https://github.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited . They are going to be downloaded now and integrated into NLP pre-processing.
Let’s rebuild the corpus, the bag of words and the interactive wordcloud (just hover over tokens to get the frequency of occurrence).
# Downloading the 4 files described above (positive short forms,
# negative short forms, negational unigrams, remaining stopwords).
# One documented helper replaces four copy-pasted download blocks,
# while reproducing each block's exact cleaning steps.
#
# read_token_file(): download a no-header CSV whose second column holds
# the tokens, and return that column as a character vector.
# - url: raw GitHub URL of the CSV file.
# - normalize: if TRUE, apply utf8_normalize(map_quote = TRUE) so that
#   typographic apostrophes are normalized (needed for the short-form
#   lists, whose entries contain apostrophes).
# - squish: if TRUE, remove leading/trailing AND repeated inner spaces
#   (str_squish); if FALSE, trim leading/trailing spaces only
#   (str_trim), sufficient for the single-word remaining stopwords.
read_token_file <- function(url, normalize = FALSE, squish = TRUE) {
  raw <- read.csv(url, header = FALSE, stringsAsFactors = FALSE)
  tokens <- if (squish) str_squish(raw[, 2]) else str_trim(raw[, 2])
  tokens <- as.vector(tokens)
  if (normalize) {
    # sapply() returns a named character vector, as in the original code.
    tokens <- sapply(tokens, utf8_normalize, map_quote = TRUE)
  }
  tokens
}
base_url <- paste0("https://raw.githubusercontent.com/Dev-P-L/",
                   "Sentiment-Analysis__Amazon-Reviews__Revisited/master/")
# Positive and negative short forms: normalized so that apostrophes
# match those found in the review text.
short_forms_pos <-
  read_token_file(paste0(base_url, "short_forms_pos.csv"),
                  normalize = TRUE)
short_forms_neg <-
  read_token_file(paste0(base_url, "short_forms_neg.csv"),
                  normalize = TRUE)
# Negational unigrams.
negation <- read_token_file(paste0(base_url, "negation.csv"))
# Remaining stopwords (single words, so trimming is enough).
stopwords_remaining <-
  read_token_file(paste0(base_url, "stopwords_remaining.csv"),
                  squish = FALSE)
# Tidy up the workspace, mirroring the original rm(myfile).
rm(base_url, read_token_file)
# Creating and preprocessing corpus again, this time avoiding the two
# flaws identified above (surviving short forms and word collapse).
corpus_av0 <-
  VCorpus(VectorSource(reviews_training$text))
corpus_av0 <-
  tm_map(corpus_av0, content_transformer(tolower))
# Replacing all punctuation marks other than apostrophes with empty
# space characters, instead of simply suppressing punctuation marks,
# not to risk collapsing two or more words into one (e.g. "broken...I").
# But keeping apostrophes to leave intact short forms such as "don't"
# and to be able to identify them as short forms and to discard them
# as such. seq_along(corpus_av0) replaces 1:nrow(reviews_training):
# same indices here (one document per training review), but robust
# (safe for length zero) and decoupled from the other object.
for (i in seq_along(corpus_av0)) {
  corpus_av0[[i]]$content <-
    gsub("(?!')[[:punct:]]", " ",
         corpus_av0[[i]]$content, perl = TRUE)
}
rm(i)
# Removing extra empty space characters (= removing all empty space
# characters except one in a sequence). Then removing short forms
# while their apostrophes are still intact.
corpus_av0 <-
  tm_map(corpus_av0, stripWhitespace)
corpus_av0 <-
  tm_map(corpus_av0, removeWords, short_forms_neg)
corpus_av0 <-
  tm_map(corpus_av0, removeWords, short_forms_pos)
# Replacing all remaining apostrophes with empty space characters
# (there might be other apostrophes than in short forms...).
for (i in seq_along(corpus_av0)) {
  corpus_av0[[i]]$content <-
    gsub("[[:punct:]]", " ", corpus_av0[[i]]$content)
}
rm(i)
# Removing n-grams from other files.
corpus_av0 <- tm_map(corpus_av0, removeWords,
negation)
corpus_av0 <- tm_map(corpus_av0, removeWords,
stopwords_remaining)
# Stemming words.
corpus_av0 <- tm_map(corpus_av0, stemDocument)
# Removing numbers and extra empty space characters.
corpus_av0 <- tm_map(corpus_av0, removeNumbers)
corpus_av0 <- tm_map(corpus_av0, stripWhitespace)
# Building up a bag of words in a Document Term Matrix.
dtm_av0 <- DocumentTermMatrix(corpus_av0)
# Managing sparsity with the same 0.995 threshold as before
# (tokens appearing in at least 0.5 % of reviews are kept).
sparse_av0 <- removeSparseTerms(dtm_av0, 0.995)
# Converting sparse_av0, which is a DocumentTermMatrix,
# to a matrix and then to a data frame.
sentSparse_av0 <-
  as.data.frame(as.matrix(sparse_av0))
# Making all column names R-friendly.
colnames(sentSparse_av0) <-
  make.names(colnames(sentSparse_av0))
# Let's check whether shortcomings have disappeared or not
# by building up a wordcloud with the most frequent tokens
# originating from the training reviews.
# Keeping only the 40 most frequent tokens.
df <-
data.frame(word = colnames(sentSparse_av0),
freq = colSums(sentSparse_av0)) %>%
filter(freq >= 10) %>%
arrange(desc(freq)) %>%
head(., 40)
# Building up wordcloud.
# The seed fixes the random color assignment across knits.
set.seed(1)
wordcloud2(df, shape = 'square',
color = 'random-light',
backgroundColor = greenish_blue,
shuffle = FALSE)In the wordcloud, there is no more token originating from short forms.
Let’s have a broader look, building up a presentation table and checking whether all abovementioned oddities have disappeared. Let’s check up in the bag of words whether dont has indeed disappeared.
# Retrieving all tokens, upstream of the sparsity process.
tokens <- findFreqTerms(dtm_av0, lowfreq = 1)
# Choosing the number of columns of the presentation table.
nc <- 5
# Calculating the number of missing tokens
# to have a full matrix.
mis <-
((ceiling(length(tokens) / nc)) * nc) - length(tokens)
# Building up the presentation table.
# Padding with "-" so the token vector fills a complete nc-column matrix.
tokens <- as.character(c(tokens, (rep("-", mis))))
tokens <-
data.frame(matrix(tokens, ncol = nc, byrow = TRUE))
# Looking for "dont". I would automate the search
# if this were production deployment!
# NOTE(review): row 51 is the alphabetical position where "dont" would
# sit — this hard-coded index depends on the current token list.
df <- tokens[51, ] %>%
as.data.frame() %>%
`colnames<-`(NULL) %>%
`rownames<-`(NULL)
# Layout of the presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1:5, bold = T, color = "white",
background = greenish_blue)| dissapoint | distort | distract | dit | dock |
Yes, indeed, dont has disappeared. Let’s check up in the same way for ive!
# Looking for "ive". I would automate the search
# if this were production deployment!
# NOTE(review): row 96 is hard-coded like row 51 above — it depends on
# the current token list.
df <- tokens[96, ] %>%
as.data.frame() %>%
`colnames<-`(NULL) %>%
`rownames<-`(NULL)
# Layout of the presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1:5, bold = T, color = "white",
background = greenish_blue)| iphon | ipod | irda | issu | item |
“ive” has also disappeared. Now “brokeni”.
# Looking for "brokeni". I would automate the search
# if this were production deployment!
# NOTE(review): row 21 is hard-coded like rows 51 and 96 above — it
# depends on the current token list.
df <- tokens[21, ] %>%
as.data.frame() %>%
`colnames<-`(NULL) %>%
`rownames<-`(NULL)
# Layout of the presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1:5, bold = T, color = "white",
background = greenish_blue)| brand | break | breakag | brilliant | broke |
“brokeni” has vanished as well, just as many other oddities.
The next interactive datatable allows to check up for some other oddities having disappeared.
# Collecting all tokens upstream of the sparsity process,
# which the token "brokeni" couldn't pass
# since there is only one instance of "brokeni"!
# lowfreq = 1 therefore returns EVERY term of the fine-tuned DTM.
tokens <-
findFreqTerms(dtm_av0, lowfreq = 1) %>%
as.data.frame() %>%
`colnames<-`("Token after NLP (but before Sparsity Process)")
# Instead of "findFreqTerms(dtm, lowfreq = 1)"
# we could also have used "colnames(dtm)" ...
# Building up interactive presentation table.
datatable(tokens, rownames = FALSE, filter = "top",
options = list(width = "450px",
pageLength = 10, scrollX = F,
# Centers the single datatable column (column 0).
columnDefs = list(list(className = 'dt-center',
targets = 0)),
# Sets background color and font color in header.
initComplete = JS(
'function(settings, json) {',
'$(this.api().table().header()).css({
"background-color": "#A41034",
"color": "white"});',
'}'),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)This interactive datatable allows us to search for other previously pinpointed oddities and to realize that they have indeed disappeared.
By entering tokens in the search box, we can once again easily check that “dont” and “ive” have indeed disappeared.
All short forms have also vanished from the bag of words.
The same holds for abovepretti, replaceeasi or unacceptableunless, which looked like the result from words collapse.
On the contrary, buyit has not vanished, because at least once it was written in that way in a review. This can easily be checked by entering buyit in the interactive table above with Training Review in the header (interactive table on blue background color).
Numbers have disappeared.
I leave uncorrected some spelling errors, such as disapoint or dissapoint, because this is no repetitive structure and occurrence seems marginal.
After cleaning the bag of words through NLP, let’s have a first try at predicting sentiment by using tokens as predictors.
On the training reviews, sentiment polarity will be predicted using a standard machine learning model. The target variable, or dependent variable, is of course the sentiment polarity. The predictors will be all tokens from the bag of words produced by the NLP process. For each row (training review), each bag of words column (predictor) contains the occurrence frequency of the corresponding token in the training review.
The chosen machine learning model will be CART: it runs rather quickly and delivers clear decision trees.
We expect the following merits from predicting with CART on the bag of words from the training reviews:
From the baseline model, we would expect an accuracy level of 50 %, since each class (positive sentiment polarity or negative sentiment polarity) is 50 % of the training reviews as already shown.
Running CART, and more specifically the function rpart(), delivers the accuracy level mentioned hereunder.
# Adding dependent variable.
sentSparse_av0 <- sentSparse_av0 %>%
mutate(sentiment = reviews_training$sentiment)
# Training CART with the algorithm rpart.
# The seed makes the fit reproducible across knits.
set.seed(1)
fit_cart_av0 <-
rpart(sentiment ~., data = sentSparse_av0)
# Predicting on the training set itself (no newdata), so the accuracy
# reported below is in-sample and therefore optimistic.
fitted_cart_av0 <-
predict(fit_cart_av0, type = "class")
cm_cart_av0 <-
confusionMatrix(fitted_cart_av0,
sentSparse_av0$sentiment)
# Extracting CART accuracy level and inserting it
# into a data frame.
df <- data.frame(round(cm_cart_av0$overall["Accuracy"], 4)) %>%
`rownames<-`("Model: CART") %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of the presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = greenish_blue) %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue)| Accuracy on the Training Set | |
|---|---|
| Model: CART | 0.768 |
Now let’s train the rpart method with the train() function from the metapackage caret.
By default, the train() function would train across 3 values of cp (the complexity parameter) and 25 bootstrapped resamples for each tuned value of cp. As far as the number of tuned values is concerned, let’s upgrade it to 15 to increase the odds of improving accuracy, especially as rpart runs rather quickly.
The default resampling method is bootstrapping, samples being built with replacement, with some reviews being picked up twice or more and some other reviews not being selected. This method seems especially appropriate here because the size of each resample will be the same as the size of the training set, which is already limited, i.e. 668. Working with e.g. K-fold cross-validation would imply further splitting the training set.
Will accuracy improve?
# Running rpart on the training set
# with train() from caret.
# Same seed as before so resampling is reproducible.
set.seed(1)
# tuneLength = 15 makes caret try 15 candidate cp values;
# resampling is caret's default (bootstrap, as discussed in
# the text), and the tuning criterion is accuracy.
fit_cart_tuned_av0 <- train(sentiment ~ .,
method = "rpart",
data = sentSparse_av0,
tuneLength = 15,
metric = "Accuracy")
# Predicting on training set.
# (predict() on a train object without newdata returns
# in-sample predictions.)
fitted_cart_tuned_av0 <-
predict(fit_cart_tuned_av0)
# Producing confusion matrix.
cm_cart_tuned_av0 <-
confusionMatrix(as.factor(fitted_cart_tuned_av0),
as.factor(sentSparse_av0$sentiment))
# Extracting accuracy level and inserting it
# into a data frame.
df <-
data.frame(round(cm_cart_tuned_av0$overall["Accuracy"], 4)) %>%
`rownames<-`("Model: CART + 15 cp Tuning Iterations") %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of the presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = greenish_blue) %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue)| Accuracy on the Training Set | |
|---|---|
| Model: CART + 15 cp Tuning Iterations | 0.7874 |
Accuracy increases from 77 % to 79 %. For the record, let’s have a look at a graph showing how accuracy evolves across the 15 cp values chosen by the train() function.
# Designing graph.
# ggplot() on a caret train object plots the resampling
# profile: mean resampled accuracy against the tuned
# cp values.
graph <-
ggplot(fit_cart_tuned_av0) +
# NOTE(review): "size" for lines is deprecated in favour of
# "linewidth" in ggplot2 >= 3.4 -- kept as is here.
geom_line(col = greenish_blue, size = 1) +
geom_point(col = harvard_crimson, size = 4) +
ggtitle("Average Bootstrap Accuracy across cp Values") +
xlab("Complexity Parameter") +
ylab("Average Accuracy (Bootstrap)") +
theme(plot.title = element_text(hjust = 0.5,
size = 16, face = "bold"),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
# Making graph interactive with ggplotly().
p <- ggplotly(graph, dynamicTicks = TRUE,
width = 800, height = 500 )
# Centering the graph, because the centering
# opts_chunk previously inserted is not operative
# in the case of the ggplotly() function.
htmltools::div(p, align = "center" )The optimal value of cp is zero. This means that the train() function has kept the decision tree as complex as possible by assigning a zero value to the complexity parameter. Would this be an insight that accuracy improvement could come from more complex models? This will be done later on.
On the graph above, maximum accuracy is only 73 %, as you can see when hovering over the highest dot. This is significantly lower than the level previously indicated, i.e. 79 %. Why is it different? Because, on the graph, it is, for each cp value, the average accuracy on the 25 bootstrapped resamples, while accuracy previously given related to the whole training set.
On the whole training set, the rpart model without tuning delivers approximately 77 % accuracy and the rpart model with tuning 79 %. Both levels are substantially higher than accuracy provided by the baseline model.
The baseline model would predict a positive evaluation for all training reviews (or alternatively a negative evaluation for all training reviews) since prevalence is 50 %. Prevalence, i.e. 50 %, should show in the accuracy level delivered by the baseline model on the training set. Let’s check it up.
# Document Term Matrix from training reviews,
# after Sparsity Process
df <- sentSparse_av0
# Baseline predictions: positive sentiment polarity
# (" Pos", with its leading space, matching the factor
# levels of the data) for every training review, stored
# as a one-column data frame; the actual polarity stays
# available in df$sentiment.
pred_baseline <-
data.frame(sentiment = rep(" Pos", nrow(df))) %>%
mutate(sentiment = factor(sentiment,
levels = levels(df$sentiment)))
# Confusion matrix
cm_baseline <-
confusionMatrix(pred_baseline$sentiment,
as.factor(df$sentiment))
# Presentation table of baseline model accuracy
# (sprintf formats to 4 decimals so 0.5 prints as 0.5000).
df <-
data.frame(sprintf("%.4f",
round(cm_baseline$overall["Accuracy"], 4))) %>%
`colnames<-`("Accuracy on the Training Set") %>%
`rownames<-`("Model: Baseline")
# Layout of presentation table and printing
knitr::kable(df, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = greenish_blue) %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue)| Accuracy on the Training Set | |
|---|---|
| Model: Baseline | 0.5000 |
Let’s summarize results from the three models, not only with accuracy but also with additional performance metrics.
# Column headers of the performance-metrics table.
colname <-
c("MODEL ID", "SHORT DESCRIPTION", "ACCURACY",
"SENSITIVITY", "NEG PRED VAL",
"SPECIFICITY", "POS PRED VAL")
# Model identifiers.
models <-
c("baseline", "cart_av0", "cart_tuned_av0")
# Short descriptions of the models.
description <-
c("Baseline Model", "CART", "CART + Tuning")
# Names of the confusion-matrix objects built earlier
# for the 3 models.
cm <- c("cm_baseline", "cm_cart_av0",
"cm_cart_tuned_av0")
# Receptacle table for the performance metrics:
# one row per model, one column per header. Every cell
# is overwritten in the loop below.
tab <-
data.frame(matrix(NA_real_,
ncol = length(colname),
nrow = length(models)))
# Collect the metrics. get() retrieves each confusion
# matrix by name, which is safer and clearer than
# eval(parse(text = ...)).
for (i in seq_along(models)) {
  cm_i <- get(cm[i])
  tab[i, 1] <- models[i]
  tab[i, 2] <- description[i]
  tab[i, 3] <- cm_i$overall["Accuracy"]
  tab[i, 4] <- cm_i$byClass["Sensitivity"]
  tab[i, 5] <- cm_i$byClass["Neg Pred Value"]
  tab[i, 6] <- cm_i$byClass["Specificity"]
  tab[i, 7] <- cm_i$byClass["Pos Pred Value"]
}
# Neg Pred Val is indeterminate for the baseline model
# since it is the result from a division by zero
# (the baseline never predicts "Neg").
# Assign a temporary numeric value so all metric columns
# can be rounded uniformly.
tab[1, 5] <- 0
# Round the metric columns. funs() is deprecated in dplyr;
# mutate_at() accepts the function and its extra argument
# directly.
tab_av0 <-
tab %>%
mutate_at(vars(3:7), round, digits = 4) %>%
`colnames<-`(colname)
# Indicating true nature of result of Neg Pred Val
# for baseline model.
tab_av0[1, 5] <- "Div. by 0"
# Layout of presentation table and printing
knitr::kable(tab_av0, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = FALSE, font_size = 16) %>%
row_spec(1, bold = TRUE, strikeout = TRUE, color = "white",
background = harvard_crimson) %>%
row_spec(2, bold = TRUE, color = greenish_blue,
background = super_light_taupe) %>%
row_spec(3, bold = TRUE, color = "white",
background = greenish_blue)| MODEL ID | SHORT DESCRIPTION | ACCURACY | SENSITIVITY | NEG PRED VAL | SPECIFICITY | POS PRED VAL |
|---|---|---|---|---|---|---|
| baseline | Baseline Model | 0.5000 | 1.0000 | Div. by 0 | 0.0000 | 0.5000 |
| cart_av0 | CART | 0.7680 | 0.6407 | 0.7136 | 0.8952 | 0.8594 |
| cart_tuned_av0 | CART + Tuning | 0.7874 | 0.7365 | 0.7609 | 0.8383 | 0.8200 |
In the table above, on row 1, fonts have been stricken through to indicate that this model is discarded because it delivers only 50 % accuracy and looks like a dead-end path.
The other two models should be seen as a cumulative process bringing accuracy improvement in a stepwise and incremental way, CART with tuning delivering the best accuracy level. Models 2 and 3 deliver higher accuracy but also asymmetry between other performance metrics: sensitivity and negative predictive value are lower than specificity and positive predictive value. This reflects false negatives being more numerous than false positives. False negatives are predictions pointing to “Neg” while the actual value is " Pos". This is an insight for text mining, pointing to perusing false negatives and coming with actionable findings.
In order to confirm that false negatives are more numerous than false positives, let’s have a look at the confusion matrix for both models. First, the confusion matrix from the rpart model without tuning.
# Metric abbreviations in confusion matrices
# (order matters: see the NOTE below).
name <- c("TP = ", "FN = ", "FP = ", "TN = ")
# Building up confusion matrix data in vector format.
# NOTE(review): as.vector() flattens the 2 x 2 table
# column-major, so this labelling assumes the positive
# level (" Pos") comes first in both factors -- confirm
# the factor level order.
tab <-
table(fitted_cart_av0, sentSparse_av0$sentiment) %>%
as.vector() %>%
paste(name, ., sep = "")
# Ordering data in confusion matrix format
# and inserting headers in the confusion matrix.
tab <-
data.frame(matrix(tab, ncol = 2, nrow = 2,
byrow = FALSE)) %>%
`colnames<-`(c("Actually positive",
"Actually negative")) %>%
`rownames<-`(c("Predicted positive with CART",
"Predicted negative with CART"))
# Layout of presentation table and printing
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = "black") %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue) %>%
column_spec(3, bold = T, color = "white",
background = harvard_crimson)| Actually positive | Actually negative | |
|---|---|---|
| Predicted positive with CART | TP = 214 | FP = 35 |
| Predicted negative with CART | FN = 120 | TN = 299 |
Indeed, the weak point lies in the first column, on greenish blue background: the relatively high number of false negatives and, as a corollary, the relatively low number of true positives. On the reference positive class (" Pos" in label or dependent variable), predicting seems problematic or at the very least challenging since false negatives are rife. On the contrary, on the reference negative class (“Neg” in label), predicting has run smoothly, with a rather satisfactorily low number of false positives.
The tuned rpart model is expected to slightly reduce the excess in false negatives.
# Abbreviations written inside the confusion-matrix cells.
cell_labels <- c("TP = ", "FN = ", "FP = ", "TN = ")
# Cross-tabulate predicted vs actual polarity, flatten the
# 2 x 2 table (column-major) and glue the abbreviations
# onto the counts.
cell_values <- table(fitted_cart_tuned_av0,
                     sentSparse_av0$sentiment) %>%
  as.vector() %>%
  paste0(cell_labels, .)
# Rebuild the 2 x 2 confusion-matrix layout and attach the
# row and column headers.
tab <- matrix(cell_values, nrow = 2, ncol = 2,
              byrow = FALSE) %>%
  data.frame() %>%
  `colnames<-`(c("Actually positive",
                 "Actually negative")) %>%
  `rownames<-`(c("Predicted positive with CART + tuning",
                 "Predicted negative with CART + tuning"))
# Style the presentation table and print it.
knitr::kable(tab, "html", align = "c") %>%
  kable_styling(bootstrap_options = "bordered",
                full_width = FALSE, font_size = 16) %>%
  column_spec(1, bold = TRUE, color = "black") %>%
  column_spec(2, bold = TRUE, color = "white",
              background = greenish_blue) %>%
  column_spec(3, bold = TRUE, color = "white",
              background = harvard_crimson)| Actually positive | Actually negative | |
|---|---|---|
| Predicted positive with CART + tuning | TP = 246 | FP = 54 |
| Predicted negative with CART + tuning | FN = 88 | TN = 280 |
With the tuned rpart model, accuracy has slightly improved: the sum of numbers on the main diagonal is larger.
On the greenish blue background, predicting on the reference positive class is less prolific in false negatives and, as a corollary, true positives are more predominant.
On the secondary diagonal, imbalance between false negatives and false positives is less marked, not only because there are fewer false negatives but also because there are more false positives. Nevertheless false negatives remain the weak point, being twice as numerous as false positives.
False negatives - and false positives - will be perused through text mining in the next section, looking for new insights towards accuracy improvement.
In this section, we are going to peruse the training reviews leading to false negatives or false positives produced by the CART model with cp tuning. This will be done with a view to pinpointing words, expressions, or phrases whose sentiment polarity could be flipped to better predict.
Another question will be raised: should topic-related words and tokens be maintained in the bag of words? Could they have any predictive impact?
Let’s first build an interactive table with all training reviews leading to false negatives or false positives with the CART model with cp tuning. Let’s start with false negatives, because there are more false negatives.
# To identify false negatives, we need both
# the actual review polarity and the predicted
# review polarity. Consequently, we combine both
# variables in one data frame.
df <-
data.frame(sentiment = reviews_training$sentiment,
pred = fitted_cart_tuned_av0)
# A false negative is a review whose actual polarity is
# positive (" Pos", with its leading space) while the
# predicted polarity is not. Encode false negatives as 1
# and everything else (true positives, true negatives,
# false positives) as 0. This single logical test replaces
# the original two-step ifelse() subtraction, which
# computed the same indicator less directly.
FN_train <- as.integer(df$sentiment == " Pos" & df$pred != " Pos")
# Row numbers corresponding to false negatives.
FN <- which(FN_train == 1)
# Now let's build up an interactive table
# with all false negatives delivered by CART
# with cp tuning.
# First, a receptacle data frame: one row per false
# negative, with empty character placeholders that the
# loop below fills in.
df_fn <-
data.frame(row = FN,
review = character(length(FN)),
tokenized = character(length(FN)),
stringsAsFactors = FALSE) %>%
`colnames<-`(c("Row",
"Training Review Leading to False Negative",
"Tokenized"))
# Populate the receptacle: raw training review and its
# tokenized form for each false-negative row.
# seq_along() (rather than 1:length(FN)) is safe even if
# FN happened to be empty.
for (i in seq_along(FN)) {
row <- FN[i]
df_fn[i, 2] <- reviews_training$text[row]
df_fn[i, 3] <- corpus_av0[[row]]$content
}
# Remove the loop temporaries.
rm(i, row)
# Converting row numbers to characters in order
# to have them left-aligned in the interactive
# data table below.
df_fn <- df_fn %>%
mutate(Row = as.character(Row))
# Creating the interactive data table, using the DT package.
# filter = "top" adds per-column search boxes; scrollX
# allows horizontal scrolling of the long review texts.
datatable(df_fn, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#a41034',
'color': 'white'});",
"}"),
# Sets background color in rows.
# NOTE(review): the console.log(rowId) below is a
# browser-console debugging leftover -- consider removing.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)In the interactive table above, if we scroll through false negatives, several scenarios appear. Let’s classify false negatives into four scenarios, identified by the pivotal pieces of information that were not used by CART to produce the right polarity, i.e. the positive polarity:
When consulting the table of false negatives above, we can pinpoint some subjective information unigrams unused, i.e. words/tokens encompassing some subjective information that points to the right polarity.
These words, and the corresponding standardized tokens, could be classified in several categories. Here, it has been opted for three main categories:
The first category is sentiment-related and so is the second category in most cases and to some degree. The third category relates to technicalities but without quantification. The three categories can be deemed as compliance-related, expressing to some degree compliance with expectations, requirements or advertisements.
To sum it up, the three categories will be referred to altogether in this project using phrases such as subjective information or words conveying subjective information or tokens conveying subjective information.
That subjective information is readily readable from a human point of view. But, in spite of these words/tokens, polarity has been wrongly read by CART. Why?
Maybe because these words/tokens are not present in the final decision tree? Or maybe because other words/tokens have precedence in the decision tree?
Let’s have a look at the final decision tree delivered by CART with cp tuning.
# Defining a color palette.
# NOTE(review): the name "palette" shadows the base
# grDevices::palette() function; harmless here but easy
# to confuse.
palette <- c(super_light_gray, super_light_taupe)
# Building up decision tree.
# prp() (rpart.plot) draws the final tuned tree; its return
# value is kept because tree$obj is inspected later on.
tree <- prp(fit_cart_tuned_av0$finalModel,
uniform = TRUE, cex = 0.8,
box.palette = palette,
border.col = "white") Unfortunately, the tokens pinpointed among the false negatives do not show in the decision tree.
What types of tokens can be seen in the decision tree?
There is a majority of tokens conveying subjective information, such as great, love, comfort, like, and disappoi, even if like can be ambiguous because it can be the preposition and not the verb.
There are also other types of tokens, but at a lower level: - intent-related token (buy) or - topic-related tokens (car).
This is an interesting insight. In CART, tokens conveying subjective information predominate, which is not at all surprising! This points to solutions allocating higher priority to tokens conveying subjective information.
Although a majority of tokens are conveying subjective information in the decision tree, we do not find many tokens with subjective information that we have pinpointed among false negatives. It can be a matter of word (or token) frequency: maybe some tokens with subjective information that we have pinpointed among false negatives only have a rather low frequency of occurrence and maybe it is the reason why they do not show in the decision tree. This can be first checked up in the wordcloud that has already been visualized. The wordcloud is only comprised of tokens with at least 10 as frequency of occurrence: will some subjective information tokens from false negatives show up in the wordcloud?
# Getting the bag of words without
# the last column, i.e. the sentiment label appended
# earlier, which is irrelevant for token counts.
df <- sentSparse_av0[, - ncol(sentSparse_av0)]
# Building up a vector with the 40 most
# frequent tokens in the bag of words
# (minimum frequency 10, sorted by decreasing frequency).
# "temp" is reused below for the histogram and for the
# wordcloud-token comparison.
temp <-
data.frame(word = colnames(df),
freq = colSums(df)) %>%
filter(freq >= 10) %>%
arrange(desc(freq)) %>%
head(., 40)
# Creating an interactive wordcloud.
# Seed fixed, presumably for reproducible placement --
# verify that wordcloud2 honours it.
set.seed(1)
wordcloud2(temp, shape = "square",
color = "random-light",
backgroundColor = greenish_blue,
shuffle = FALSE)For illustrative purposes, tokens can be visualized in decreasing order of frequency in the interactive histogram below.
# Preparing the histogram.
# Tokens are reordered by frequency so that, after
# coord_flip(), the most frequent token appears on top.
graph <- temp %>%
mutate(word = reorder(word, freq)) %>%
ggplot(aes(word, freq)) +
geom_bar(stat = "identity", width = 0.80,
color = "#007ba7", fill = "#007ba7") +
coord_flip() +
ggtitle("Token Frequency") +
xlab("Token") +
ylab("Frequency") +
theme(plot.title = element_text(hjust = 0.5,
size = 16,
face = "bold"),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(angle = 45,
hjust = 1,
size = 12),
axis.text.y = element_text(size = 12))
# Making the graph interactive.
p <- ggplotly(graph, dynamicTicks = TRUE,
width = 500, height = 1000)
# Centering the interactive graph.
htmltools::div(p, align = "center" )Tokens depicted in the wordcloud and in the histogram can be
Most decision tree tokens appear in the wordcloud (and in the histogram). To check it up, let’s compute the proportion of decision tree tokens appearing in the wordcloud and in the histogram.
# Collecting decision tree tokens
# in a character vector.
# (tree$obj$frame$var holds one entry per node; leaves are
# marked "<leaf>" and are dropped below.)
tree_tokens <- tree$obj$frame$var
tree_tokens <-
tree_tokens[!tree_tokens == "<leaf>"]
# Collecting wordcloud tokens. They have already
# been stored in the data frame "temp", and in particular
# in the column whose name is "word".
wordcloud_tokens <- temp$word
# Extracting tree tokens that also appear
# in the wordcloud and in the histogram.
intersection <-
intersect(tree_tokens, wordcloud_tokens)
# Computing proportion of decision tree tokens
# appearing in the wordcloud and in the histogram.
# NOTE(review): intersect() de-duplicates tree_tokens while
# length(tree_tokens) counts duplicates; if the same
# variable splits several nodes, the proportion is slightly
# understated -- confirm this is intended.
prop <-
length(intersection) * 100 / length(tree_tokens)
prop <- round(prop, 0)
prop <- paste(prop, "%", sep = " ")
# Building up a presentation data frame
# for the proportion.
tab <-
data.frame(prop) %>%
`colnames<-`("Proportion of Tree Tokens Appearing in the Wordcloud")
# Layout of the presentation table and printing.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = "white",
background = greenish_blue) | Proportion of Tree Tokens Appearing in the Wordcloud |
|---|
| 70 % |
70 % of decision tree tokens are in the wordcloud, i.e. they are among the 40 most frequent tokens.
But token frequency is not enough to enter the decision tree: tokens need discriminant predictive power. So, phone is the wordcloud token with the highest frequency – 116 occurrences – but the decision tree is not comprised of phone; the reason of it seems obvious. On the contrary, great only has 69 occurrences and appears on top of the decision tree.
We can better visualize this when looking at some rpart output.
# class.output = "bg-primary"
# customizes the style of the decision tree
# and dramatically improves readability,
# giving white color font and blue background.
# Printing the rpart object stored by prp(): one line per
# node with split rule, n, loss, predicted class and class
# probabilities.
tree$obj## n= 668
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 668 334 Pos (0.50000000 0.50000000)
## 2) great>=0.5 68 4 Pos (0.94117647 0.05882353) *
## 3) great< 0.5 600 270 Neg (0.45000000 0.55000000)
## 6) good>=0.5 45 7 Pos (0.84444444 0.15555556) *
## 7) good< 0.5 555 232 Neg (0.41801802 0.58198198)
## 14) love>=0.5 16 0 Pos (1.00000000 0.00000000) *
## 15) love< 0.5 539 216 Neg (0.40074212 0.59925788)
## 30) excel>=0.5 17 1 Pos (0.94117647 0.05882353) *
## 31) excel< 0.5 522 200 Neg (0.38314176 0.61685824)
## 62) nice>=0.5 12 0 Pos (1.00000000 0.00000000) *
## 63) nice< 0.5 510 188 Neg (0.36862745 0.63137255)
## 126) best>=0.5 16 2 Pos (0.87500000 0.12500000) *
## 127) best< 0.5 494 174 Neg (0.35222672 0.64777328)
## 254) comfort>=0.5 10 1 Pos (0.90000000 0.10000000) *
## 255) comfort< 0.5 484 165 Neg (0.34090909 0.65909091)
## 510) well>=0.5 16 4 Pos (0.75000000 0.25000000) *
## 511) well< 0.5 468 153 Neg (0.32692308 0.67307692)
## 1022) recommend>=0.5 17 5 Pos (0.70588235 0.29411765) *
## 1023) recommend< 0.5 451 141 Neg (0.31263858 0.68736142)
## 2046) better>=0.5 14 4 Pos (0.71428571 0.28571429) *
## 2047) better< 0.5 437 131 Neg (0.29977117 0.70022883)
## 4094) like>=0.5 18 7 Pos (0.61111111 0.38888889) *
## 4095) like< 0.5 419 120 Neg (0.28639618 0.71360382)
## 8190) ani>=0.5 9 3 Pos (0.66666667 0.33333333) *
## 8191) ani< 0.5 410 114 Neg (0.27804878 0.72195122)
## 16382) happi>=0.5 7 2 Pos (0.71428571 0.28571429) *
## 16383) happi< 0.5 403 109 Neg (0.27047146 0.72952854)
## 32766) just>=0.5 10 4 Pos (0.60000000 0.40000000) *
## 32767) just< 0.5 393 103 Neg (0.26208651 0.73791349)
## 65534) disappoint< 0.5 379 103 Neg (0.27176781 0.72823219)
## 131068) money< 0.5 366 103 Neg (0.28142077 0.71857923)
## 262136) first< 0.5 355 103 Neg (0.29014085 0.70985915)
## 524272) drop< 0.5 346 103 Neg (0.29768786 0.70231214)
## 1048544) bad< 0.5 338 103 Neg (0.30473373 0.69526627)
## 2097088) poor< 0.5 330 103 Neg (0.31212121 0.68787879)
## 4194176) terribl< 0.5 322 103 Neg (0.31987578 0.68012422)
## 8388352) car>=0.5 11 5 Pos (0.54545455 0.45454545) *
## 8388353) car< 0.5 311 97 Neg (0.31189711 0.68810289)
## 16776706) but< 0.5 297 96 Neg (0.32323232 0.67676768)
## 33553412) qualiti>=0.5 7 3 Pos (0.57142857 0.42857143) *
## 33553413) qualiti< 0.5 290 92 Neg (0.31724138 0.68275862)
## 67106826) product< 0.5 280 91 Neg (0.32500000 0.67500000)
## 134213652) phone< 0.5 241 82 Neg (0.34024896 0.65975104)
## 268427304) all>=0.5 7 2 Pos (0.71428571 0.28571429) *
## 268427305) all< 0.5 234 77 Neg (0.32905983 0.67094017) *
## 134213653) phone>=0.5 39 9 Neg (0.23076923 0.76923077) *
## 67106827) product>=0.5 10 1 Neg (0.10000000 0.90000000) *
## 16776707) but>=0.5 14 1 Neg (0.07142857 0.92857143) *
## 4194177) terribl>=0.5 8 0 Neg (0.00000000 1.00000000) *
## 2097089) poor>=0.5 8 0 Neg (0.00000000 1.00000000) *
## 1048545) bad>=0.5 8 0 Neg (0.00000000 1.00000000) *
## 524273) drop>=0.5 9 0 Neg (0.00000000 1.00000000) *
## 262137) first>=0.5 11 0 Neg (0.00000000 1.00000000) *
## 131069) money>=0.5 13 0 Neg (0.00000000 1.00000000) *
## 65535) disappoint>=0.5 14 0 Neg (0.00000000 1.00000000) *
We can see the rationale of the decision tree. great arrives on top, with presence in 68 training reviews – we saw in the interactive wordcloud and histogram that the word (token) frequency was in fact 69 so there must be a review with twice the word (token) great. great is present in 64 reviews with positive sentiment polarity and only in 4 reviews with negative sentiment polarity. It is powerfully discriminant.
The second token in the decision tree is good, present in 45 reviews, of which 38 reviews with positive sentiment polarity.
comfort is present in 10 training reviews, of which only 1 is negative. It comes before recommend with presence in 17 training reviews but 5 of them have negative sentiment polarity.
Below comfort we also find like with presence in 18 training reviews but 7 of them have negative sentiment polarity.
The ranking shows that not only frequency matters but also discriminant predictive power.
Now, it is time we went back to false negatives containing positive subjective information words (tokens) that have not been used to rightly predict positive sentiment polarity. Why are they not in the decision tree? Because frequency is too low or because discriminant predictive power is too low? Frequency and discriminant predictive power could be calculated for subjective information tokens that are present in false negatives but not in the decision tree. But that is CART’s job, isn’t it?
Actually, we are going to switch from statistics to linguistics. Let’s see whether the already pinpointed tokens with positive subjective information could indeed flip sentiment polarity and help avoid some false negatives. These are the words we will look for in training reviews leading to false negatives:
# Positive-subjective-information words we are looking for
# among the false negatives.
patterns <-
c("glad", "impressed", "joy", "awesome", "fine",
"rocks", "fast", "prompt", "sturdy")
# Collapsing all words into one regular expression, with
# the alternation operator | between words.
patterns <- paste(patterns, collapse = "|")
# A data frame with the false negatives has
# already been created. It has been called
# df_fn. Keep only rows whose review text contains at
# least one of the words in "patterns".
# str_detect() already returns a logical, so the redundant
# "== TRUE" comparison has been dropped.
df <- df_fn %>%
filter(str_detect(`Training Review Leading to False Negative`,
patterns)) %>%
`colnames<-`(c("Row",
"False Negative with Positive Information",
"Tokenized"))
# Creating the interactive data table, using the DT package.
# filter = "top" adds per-column search boxes; scrollX
# allows horizontal scrolling of the long review texts.
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#a41034',
'color': 'white'});",
"}"),
# Sets background color in rows.
# NOTE(review): the console.log(rowId) below is a
# browser-console debugging leftover -- consider removing.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)From a human point of view, sentiment polarity of the training reviews above is clear. But even from a machine learning point of view, in almost all cases, replacing these positively polarized words with a clearly positive predictor would flip the predicted sentiment polarity of the reviews to the right polarity.
Why couldn’t CART do it?
We can think of two possible reasons (hinted at above): on the one hand, maybe these words were also present in numerous reviews with actual negative sentiment polarity; on the other hand, these words do not show up in the wordcloud, which means their occurrence frequency is at best not high and at worst very limited.
Acting on the first reason could maybe be done by choosing a more complex algorithm than rpart, which could possibly take many more predictors into account. This will not be done at this stage because a more effective algorithm can have a tendency to stick to data, even to outliers, on the training set and to be somewhat disappointing on the validation set (overfitting). This might camouflage problems.
It would be possible to act on the second reason in another way: regrouping, in one way or another, the words (tokens) containing positive subjective information might be an avenue of research. In that way, even words with low frequency could have their say.
This looks like an interesting insight.
In conclusion, it might be impactful to garner subjective information conveyed by tokens such as glad, impressed, joy, awesome, fine, rocks, fast, prompt, and sturdy. Since CART doesn’t do it, why not replace such tokens with a generic positive token? This would empower subjective information by building high frequency generic tokens only typified by sentiment orientation.
In this project, words and tokens conveying positive subjective information will be inserted in additional files. That is one avenue of improvement that will be investigated in the section Predicting after TM (Text Mining).
In a similar way, negative subjective information can also impact sentiment polarity. Could some words/tokens with negative subjective information flip some false positives to the right polarity? Let’s have a look at training reviews leading to false positives with the tuned rpart model.
# To identify false positives, we need both
# the actual review polarity and the predicted
# review polarity. Consequently, we combine both
# variables in one data frame.
df <-
data.frame(sentiment = reviews_training$sentiment,
pred = fitted_cart_tuned_av0)
# A false positive is a review whose actual polarity is
# negative ("Neg") while the predicted polarity is not.
# (This chunk mirrors the false-negative chunk earlier,
# but for false positives; the comments originally pasted
# here described false negatives.)
# Encode false positives as 1 and everything else (true
# positives, true negatives, false negatives) as 0. This
# single logical test replaces the original two-step
# ifelse() subtraction, which computed the same indicator
# less directly.
FP_train <- as.integer(df$sentiment == "Neg" & df$pred != "Neg")
# Row numbers corresponding to false positives.
FP <- which(FP_train == 1)
# Now let's build up an interactive table
# with all false positives delivered by CART
# with cp tuning.
# First, a receptacle data frame: one row per false
# positive, with empty character placeholders that the
# loop below fills in.
df_fp <-
data.frame(row = FP,
review = character(length(FP)),
tokenized = character(length(FP)),
stringsAsFactors = FALSE) %>%
`colnames<-`(c("Row",
"False Positive with Negative Information",
"Tokenized"))
# Populate the receptacle: raw training review and its
# tokenized form for each false-positive row.
# seq_along() (rather than 1:length(FP)) is safe even if
# FP happened to be empty.
for (i in seq_along(FP)) {
row <- FP[i]
df_fp[i, 2] <- reviews_training$text[row]
df_fp[i, 3] <- corpus_av0[[row]]$content
}
# Remove the loop temporaries.
rm(i, row)
# Converting row numbers to characters in order
# to have them left-aligned in the interactive
# data table below.
df_fp <-
df_fp %>%
mutate(Row = as.character(Row))
# Creating the interactive data table,
# using the DT package.
datatable(df_fp, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#e62900',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#ffb680";','}',
'}')
)
)Some training reviews leading to false positives contain subjective information that could flip sentiment polarity to positiveness. Here are a few examples with the words unusable, embarrassing, and unreliable, or the corresponding tokens usus, embarrass, or unreli.
Consequently some words and tokens with negative subjective information will be inserted into additional files.
Moreover, perusing false positives leads to another statement: sentiment polarity is often flipped by negation. Let’s switch now to negation.
Another category of words (tokens) can also flip sentiment polarity: negational unigrams, or, more simply, negation, such as not or no. Among false negatives and false positives, we could notice some occurrences of negation that flipped sentiment polarity but that could obviously not be taken into account by CART since these negational unigrams were considered as stopwords and had, for that reason, been discarded from tokens.
Let’s start with false positives containing negation.
# Let's determine the patterns we are looking
# for in false positives. Since most reviews
# are capitalized, variants have been provided
# with capitalization. Around some negational
# unigrams, there is a leading empty space
# character and/or a trailing empty space
# character in order to avoid picking up
# longer unigrams containing some negational
# unigrams, e.g. "notice" instead of "no" or
# instead of "not".
patterns <-
c("neither", "Neither", "never", "Never",
" no ", "No ", "none", "None", " nor ",
" not ", "Not ", "nothing", "Nothing")
# Collapsing all words into one regular expression,
# with the alternation operator | between words.
patterns <- paste(patterns, collapse = "|")
# A data frame with the false positives has
# already been created. It has been called
# df_fp. By filtering, let's keep only rows
# with at least one word contained in "patterns".
# str_detect() already returns a logical vector,
# so no comparison with TRUE is needed.
df <- df_fp %>%
filter(str_detect(`False Positive with Negative Information`,
patterns)) %>%
`colnames<-`(c("Row", "False Positive with Negation", "Tokenized"))
# Creating the interactive data table,
# using the DT package.
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#e62900',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#ffb680";','}',
'}')
)
)As the table above clearly shows, negation is indeed essential: it often flips sentiment polarity.
Now, let’s have a look at false negatives with negation.
# Let's determine the patterns we are looking
# for in false negatives. Since most reviews
# are capitalized, variants have been provided
# with capitalization. Around some negational
# unigrams, there is a leading empty space
# character and/or a trailing empty space
# character in order to avoid picking up
# longer unigrams containing some negational
# unigrams, e.g. "notice" instead of "no" or
# instead of "not".
patterns <-
c("neither", "Neither", "never", "Never",
" no ", "No ", "none", "None", " nor ",
" not ", "Not ", "nothing", "Nothing")
# Collapsing all words into one regular expression,
# with the alternation operator | between words.
patterns <- paste(patterns, collapse = "|")
# A data frame with the false negatives has
# already been created. It has been called
# df_fn. By filtering, let's keep only rows
# with at least one word contained in "patterns".
# str_detect() already returns a logical vector,
# so no comparison with TRUE is needed.
df <- df_fn %>%
filter(str_detect(`Training Review Leading to False Negative`,
patterns)) %>%
`colnames<-`(c("Row", "False Negative with Negation", "Tokenized"))
# Creating the interactive data table,
# using the DT package.
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#a41034',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)The bigram no problem is clear from a human point of view but this bigram has become problem, the negational token no having been removed with all other stopwords. Even if problem is polarized under a generic negative token, as suggested above, the right polarity of no problem wouldn’t show.
Numerous avenues are opened up. Let’s draw three of them:
Actually, the number of training reviews with not or no is limited among false negatives. But it was much higher among false positives. Moreover, frequency could also be higher in the validation set, which we do not know. Keeping negational unigrams will be given a try in … Furthermore, negational multigrams will be associated, e.g. not so, no more, etc.
Up to now, negation has been shown in negational n-grams but negation can also be encapsulated into negative short forms.
Negation can be expressed by negative short forms — also called contractions — such as isn’t.
Among false positives, there are many negative short forms that flip sentiment polarity to negativeness. There are also some among false negatives.
Consequently, keeping negative short forms will also be given a try, in parallel with negational n-grams.
Sentiment can also be expressed through associations of words, beyond the case of negational multigrams, which has already been treated.
In some cases, these are rather stereotyped phrases. Let’s have a look at a few training reviews leading to false negatives but containing multigrams whose consideration could flip sentiment polarity prediction to the right status, i.e. positive sentiment polarity.
# Patterns we are looking for in false negatives
patterns <-
c("a bargain", "a winner", "any problem",
"Five star ", "must have", "no problem",
"thumbs up", "Whoa", "whoa")
# Collapsing all words into one regular expression,
# with the alternation operator | between words.
patterns <- paste(patterns, collapse = "|")
# A data frame with the false negatives has already
# been created. It has been called df_fn. Let's
# keep only rows with at least one multigram from
# "patterns". str_detect() already returns a logical
# vector, so no comparison with TRUE is needed.
df <- df_fn %>%
filter(str_detect(`Training Review Leading to False Negative`,
patterns)) %>%
`colnames<-`(c("Row",
"False Negative with Positive Multigram",
"Tokenized"))
# Creating the interactive data table,
# using the DT package.
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#a41034',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#d6c0b0";','}',
'}')
)
)In the table above, we can see 8 examples of false negatives containing positive multigrams that can flip sentiment polarity to positiveness. These are only a few examples.
Positive and negative multigrams will be given a try, just as positive and negative unigrams.
Some reviews have delivered some more difficult cases. Here are a few examples among false positives.
# A data frame with the false positives has
# already been created. It has been called df_fp.
# Let's pick up 5 rows with figurative usage,
# slang, sarcasm or multifaceted wording.
# Note: the Row column was converted to character
# earlier, but %in% coerces both sides to a common
# type, so the numeric literals below still match.
df <- df_fp %>%
filter(Row %in% c(54, 111, 113, 433, 586)) %>%
`colnames<-`(c("Row",
"False Positive with Contextual Impact",
"Tokenized"))
# Creating the interactive data table,
# using the DT package.
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#e62900',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#ffb680";','}',
'}')
)
)There can be figurative wording, sarcasm, irony, metaphors, multifaceted reviews, etc.
The table above gives five examples of more complex wording:
Some metaphors and some slang can enter additional files just as ordinary words. Of course, rpart can handle a word like crawl used as a metaphor or slang such as crap … if frequency is high enough and if usage has statistically significant polarization. But among the training reviews, the occurrence frequency of these words is very low; knowing that the frequency among the training reviews selected at random is very limited, the conditional probability of having them in the validation set is also very limited; but including them in an additional file of subjective information (in these cases with negative sentiment polarity) can only be harmless if not useful.
Sarcasm is out of reach in this working paper.
Multifaceted wording might also be out of reach, although a more sophisticated machine learning model might perform a little bit better.
A very simple trick will get a try, with a view to better tackling multifaceted wording. In case of multifaceted wording, the word but often indicates restriction and is often the dominant meaning; in my humble opinion, it often introduces some dominant negative meaning.
Let’s check that but most of the time introduces impactful negative information. We’ll do that by collecting all training reviews containing the word but.
# Building up data frame with all training reviews
# containing "but ".
# str_detect() already returns a logical vector,
# so no comparison with TRUE is needed.
# NOTE(review): the pattern "but " also matches
# substrings such as "debut " — confirm this is
# acceptable for the count discussed below.
tab <- reviews_training %>%
filter(str_detect(text, "but ")) %>%
`colnames<-`(c("Row Number",
'Training Review Containing the Word "but"',
"Sentiment"))
# Building up interactive presentation table.
datatable(tab, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE,
# Setting background color and font color
# in header.
initComplete = JS(
'function(settings, json) {',
'$(this.api().table().header()).css({
"background-color": "#507786",
"color": "white"});',
'}'),
# Setting background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "powderblue";','}',
'}')
)
)Using the interactive table above, we can easily notice that 19 reviews containing but, out of 28, have actual negative sentiment polarity.
Consequently, we are going to remove but from the stopwords and let it stand on its own as a token in the bag of words. The algorithm rpart will handle it according to probabilities.
In text mining, several insights have been obtained.
Among insights, let’s mention:
In the next section, these text mining insights will be tentatively transposed into NLP and machine learning actions towards more accuracy.
Three avenues of improvement have been opened up:
Stepwise, the three avenues will be quantitatively tested.
The whole research has been performed only on training reviews without any kind of intermixture with validation reviews.
Several avenues of improvement have been drawn in the previous section through text mining. Each avenue will now be followed and its merits will be evaluated on the basis of prediction accuracy.
Negational n-grams have been introduced, except for negative short forms; NLP has been rerun, as has the CART model with tuning, which is used as a performance yardstick. Here are the results.
# Building up new corpus.
corpus_av1_a <-
VCorpus(VectorSource(reviews_training$text))
corpus_av1_a <-
tm_map(corpus_av1_a, content_transformer(tolower))
# Replacing all punctuation marks with empty space
# characters, instead of just removing punctuation marks,
# to prevent tokens like "brokeni" from being generated.
# Keeping apostrophes to leave intact short forms
# such as "don't" so that they can be removed as stopwords.
# seq_len() keeps the loop safe even for an empty
# training set (1:nrow() would yield c(1, 0)).
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_a[[i]]$content <-
gsub("(?!')[[:punct:]]", " ",
corpus_av1_a[[i]]$content, perl = TRUE)
}
rm(i)
# Removing short forms after regulating empty space
# characters.
corpus_av1_a <-
tm_map(corpus_av1_a, stripWhitespace)
corpus_av1_a <-
tm_map(corpus_av1_a, removeWords, short_forms_neg)
corpus_av1_a <-
tm_map(corpus_av1_a, removeWords, short_forms_pos)
# Removing remaining apostrophes (there can be
# apostrophes outside of short forms).
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_a[[i]]$content <-
gsub("[[:punct:]]", " ",
corpus_av1_a[[i]]$content)
}
rm(i)
# Removing stopwords_remaining, stemming,
# removing numbers, digits and multiple
# empty space characters (leaving only
# one empty space character at a time).
corpus_av1_a <-
tm_map(corpus_av1_a, removeWords,
stopwords_remaining)
corpus_av1_a <-
tm_map(corpus_av1_a, stemDocument)
corpus_av1_a <-
tm_map(corpus_av1_a, removeNumbers)
corpus_av1_a <-
tm_map(corpus_av1_a, stripWhitespace)
# Building bag of words, managing sparsity threshold,
# converting to data frame, regularizing column names
# and adding dependent variable.
dtm_av1_a <- DocumentTermMatrix(corpus_av1_a)
sparse_av1_a <- removeSparseTerms(dtm_av1_a, 0.995)
sentSparse_av1_a <-
as.data.frame(as.matrix(sparse_av1_a))
colnames(sentSparse_av1_a) <-
make.names(colnames(sentSparse_av1_a))
sentSparse_av1_a <- sentSparse_av1_a %>%
mutate(sentiment = reviews_training$sentiment)
# Training CART with the algorithm rpart with cp tuning.
set.seed(1)
fit_cart_tuned_av1_a <- train(sentiment ~ .,
method = "rpart",
data = sentSparse_av1_a,
tuneLength = 15,
metric = "Accuracy")
# Predicting on the training set.
fitted_cart_tuned_av1_a <-
predict(fit_cart_tuned_av1_a)
# Producing the confusion matrix on the training set.
cm_cart_tuned_av1_a <-
confusionMatrix(as.factor(fitted_cart_tuned_av1_a),
as.factor(sentSparse_av1_a$sentiment))
# Table comprised of accuracy
tab <-
data.frame(cm_cart_tuned_av1_a$overall["Accuracy"]) %>%
`rownames<-`("Model: Neg Short Forms + CART + Tuning") %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of the table and printing.
# TRUE/FALSE spelled out (T and F are reassignable).
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = FALSE, font_size = 16) %>%
column_spec(1, bold = TRUE, color = greenish_blue) %>%
column_spec(2, bold = TRUE, color = "white",
background = greenish_blue)| Accuracy on the Training Set | |
|---|---|
| Model: Neg Short Forms + CART + Tuning | 0.7949102 |
There is accuracy improvement, modest though.
Consequently, this option will be maintained. All negational n-grams contained in negation.csv will be kept entirely in the corpus and thus in the bag of words, because the list of stopwords has been appropriately shortened.
For the record, does the token not now show in the decision tree?
# Two-color palette for the tree plot: one background
# color per node class.
palette <- c(super_light_gray, super_light_taupe)
# Plotting the tuned CART decision tree with prp();
# the call continues on the following line.
prp(fit_cart_tuned_av1_a$finalModel,
uniform = TRUE, cex = 0.8,
box.palette = palette, border.col = "white")Yes, not shows in the decision tree and rather predominantly!
The test about negational n-grams did not include negative short forms: they had been removed as stopwords, because they are at stake in the next section and will get sui generis treatment.
Two scenarios will get a try:
First, negative short forms will no longer be removed from the corpus and will thus enter the bag of words. Impact on accuracy will be tested.
# Building up new corpus.
corpus_av1_b <-
VCorpus(VectorSource(reviews_training$text))
corpus_av1_b <-
tm_map(corpus_av1_b, content_transformer(tolower))
# Replacing all punctuation marks with empty space
# characters, instead of just removing punctuation marks,
# to prevent tokens like "brokeni" from being generated.
# Keeping apostrophes to leave intact positive short forms
# such as "it's" so that they can be removed.
# seq_len() keeps the loop safe even for an empty
# training set (1:nrow() would yield c(1, 0)).
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_b[[i]]$content <-
gsub("(?!')[[:punct:]]", " ",
corpus_av1_b[[i]]$content, perl = TRUE)
}
rm(i)
# Removing only positive short forms after reducing
# to one the number of empty space characters in a row.
corpus_av1_b <- tm_map(corpus_av1_b, stripWhitespace)
corpus_av1_b <-
tm_map(corpus_av1_b, removeWords, short_forms_pos)
# Removing remaining apostrophes.
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_b[[i]]$content <-
gsub("[[:punct:]]", " ", corpus_av1_b[[i]]$content)
}
rm(i)
# Removing stopwords_remaining, stemming, removing
# numbers, digits and multiple empty space characters
# (leaving only one empty space character at a time).
corpus_av1_b <-
tm_map(corpus_av1_b, removeWords, stopwords_remaining)
corpus_av1_b <- tm_map(corpus_av1_b, stemDocument)
corpus_av1_b <- tm_map(corpus_av1_b, removeNumbers)
corpus_av1_b <- tm_map(corpus_av1_b, stripWhitespace)
# Building bag of words, managing sparsity threshold,
# converting to data frame, regularizing column names
# and adding dependent variable.
dtm_av1_b <- DocumentTermMatrix(corpus_av1_b)
sparse_av1_b <- removeSparseTerms(dtm_av1_b, 0.995)
sentSparse_av1_b <-
as.data.frame(as.matrix(sparse_av1_b))
colnames(sentSparse_av1_b) <-
make.names(colnames(sentSparse_av1_b))
sentSparse_av1_b <- sentSparse_av1_b %>%
mutate(sentiment = reviews_training$sentiment)
# Training CART with the algorithm rpart with cp tuning.
set.seed(1)
fit_cart_tuned_av1_b <- train(sentiment ~ .,
method = "rpart",
data = sentSparse_av1_b,
tuneLength = 15,
metric = "Accuracy")
# Predicting on the training set.
fitted_cart_tuned_av1_b <-
predict(fit_cart_tuned_av1_b)
# Producing the confusion matrix on the training set.
cm_cart_tuned_av1_b <-
confusionMatrix(as.factor(fitted_cart_tuned_av1_b),
as.factor(sentSparse_av1_b$sentiment))
# Table comprised of accuracy
tab <-
data.frame(cm_cart_tuned_av1_b$overall["Accuracy"]) %>%
`rownames<-`("Model: Negation + Neg Short Forms + CART + Tuning") %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of the table and printing.
# TRUE/FALSE spelled out (T and F are reassignable).
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = FALSE, font_size = 16) %>%
column_spec(1, bold = TRUE, color = harvard_crimson) %>%
column_spec(2, bold = TRUE, color = "white",
background = harvard_crimson)| Accuracy on the Training Set | |
|---|---|
| Model: Negation + Neg Short Forms + CART + Tuning | 0.760479 |
Adding negative short forms such as isn’t lowers accuracy level. This will not be done as such.
Consequently, another try will be done with negative short forms: instead of being added, negative short forms will be replaced with not.
Positive short forms will still be removed from the corpus and the bag of words and negative short forms will be replaced with the word not.
Let’s have a look at the new accuracy level.
# Building up new corpus.
corpus_av1_c <-
VCorpus(VectorSource(reviews_training$text))
corpus_av1_c <-
tm_map(corpus_av1_c, content_transformer(tolower))
# Replacing all punctuation marks with empty space
# characters, instead of just removing punctuation marks,
# to prevent tokens like "brokeni" from being generated.
# Keeping apostrophes to leave intact short forms
# such as "it's" so that positive short forms can be removed.
# seq_len() keeps the loops safe even for an empty
# training set (1:nrow() would yield c(1, 0)).
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_c[[i]]$content <-
gsub("(?!')[[:punct:]]", " ",
corpus_av1_c[[i]]$content, perl = TRUE)
}
rm(i)
# Adding one white space character at the beginning
# and at the end of each negative short form
# in order to prepare to use the function gsub()
# without picking up substrings.
dummy <- paste("", short_forms_neg, "")
# Replacing negative short forms with " not ".
# NOTE(review): gsub() treats dummy[j] as a regex;
# the short forms are plain words with apostrophes,
# but confirm that none contains regex metacharacters
# (fixed = TRUE would otherwise be safer).
for (i in seq_len(nrow(reviews_training))) {
for (j in seq_along(short_forms_neg)) {
corpus_av1_c[[i]]$content <-
gsub(dummy[j], " not ",
corpus_av1_c[[i]]$content)
}
}
rm(dummy, j)
# Removing only positive short forms after reducing
# to one the number of empty space characters in a row.
corpus_av1_c <-
tm_map(corpus_av1_c, stripWhitespace)
corpus_av1_c <-
tm_map(corpus_av1_c, removeWords, short_forms_pos)
# Removing remaining apostrophes.
for (i in seq_len(nrow(reviews_training))) {
corpus_av1_c[[i]]$content <-
gsub("[[:punct:]]", " ", corpus_av1_c[[i]]$content)
}
rm(i)
# Removing stopwords_remaining, stemming, removing
# numbers, digits and multiple empty space characters
# (leaving only one empty space character at a time).
corpus_av1_c <-
tm_map(corpus_av1_c, removeWords, stopwords_remaining)
corpus_av1_c <- tm_map(corpus_av1_c, stemDocument)
corpus_av1_c <- tm_map(corpus_av1_c, removeNumbers)
corpus_av1_c <- tm_map(corpus_av1_c, stripWhitespace)
# Building bag of words, managing sparsity threshold,
# converting to data frame, regularizing column names
# and adding dependent variable.
dtm_av1_c <- DocumentTermMatrix(corpus_av1_c)
sparse_av1_c <- removeSparseTerms(dtm_av1_c, 0.995)
sentSparse_av1_c <-
as.data.frame(as.matrix(sparse_av1_c))
colnames(sentSparse_av1_c) <-
make.names(colnames(sentSparse_av1_c))
sentSparse_av1_c <- sentSparse_av1_c %>%
mutate(sentiment = reviews_training$sentiment)
# Training CART with the algorithm rpart with cp tuning.
set.seed(1)
fit_cart_tuned_av1_c <- train(sentiment ~ .,
method = "rpart",
data = sentSparse_av1_c,
tuneLength = 15,
metric = "Accuracy")
# Predicting on the training set.
fitted_cart_tuned_av1_c <-
predict(fit_cart_tuned_av1_c)
# Producing the confusion matrix on the training set.
cm_cart_tuned_av1_c <-
confusionMatrix(as.factor(fitted_cart_tuned_av1_c),
as.factor(sentSparse_av1_c$sentiment))
# Table comprised of accuracy
tab <-
data.frame(cm_cart_tuned_av1_c$overall["Accuracy"]) %>%
`rownames<-`('Model: Negation + [Neg Short Forms = "not"] + CART + Tuning') %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of the table and printing.
# TRUE/FALSE spelled out (T and F are reassignable).
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = FALSE, font_size = 16) %>%
column_spec(1, bold = TRUE, color = harvard_crimson) %>%
column_spec(2, bold = TRUE, color = "white",
background = harvard_crimson)| Accuracy on the Training Set | |
|---|---|
| Model: Negation + [Neg Short Forms = “not”] + CART + Tuning | 0.7874251 |
Replacing negative short forms with not downgrades accuracy. This path will not be followed.
Let’s move to the third avenue of research.
In false negatives and false positives, analysis has pinpointed unigrams and multigrams that convey subjective information.
In the line of these insights, these n-grams have been listed and classified as positive and negative. They have been inserted into four files and the files have been uploaded in the GitHub repository https://github.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited :
Here are a few examples from each file of polarized n-grams.
Positive sentiment unigrams can be found in subj_pos_unigrams.csv (stemmed), e.g. awesom, fine, etc. Synonyms, found on the internet, have been added.
Positive multigrams are stored in the file sub_pos_multigrams.csv (not stemmed but lowercased), e.g. no trouble, five star, thumbs up, a 10, must have. Possible variants have usually been added, including variants originating from spelling errors or alternative grammar: no troubles, not any trouble, not any troubles, no problem, no problems, etc.; five stars, five star, 5-star, 5star, 5 star; a ten, etc.
Negative unigrams (after stemming) are in the file subj_neg_unigrams.csv, e.g. horribl, crap, junk, etc. Synonyms have been added, found on the internet.
Negative multigrams (not stemmed but lowercased) are stored in the file sub_neg_multigrams.csv, e.g. 1 star, not good, no good, etc. Possible variants have usually been added, including variants originating from spelling errors or alternative grammar.
In the training reviews, instances of the positive n-grams will be replaced with " subjpo " and instances of negative n-grams with " subjneg ".
Efficacy-minded rules will be applied in this NLP process.
First, around n-grams from the four files mentioned above, one leading empty space character and one trailing empty space character will be added when looking for instances of these n-grams in reviews. Otherwise, matching could be completely wrong. Let’s take an example. In the file subj_pos_unigrams.csv, we see the token worth. In the bag of words, there is the token worthless, coming from training review 446. If worth is not surrounded by empty space characters, the token worthless from review 446 will be considered a match and will be replaced with " subjpo "! A negatively oriented unigram would become a positively oriented unigram! Consequently, one empty space character is added in front of and at the end of each n-gram from the four files above before looking for matching instances from reviews, in order to avoid replacing on the basis of partial substrings.
Second, as a consequence, an empty space character has to be added at the beginning and at the end of each NLP-transformed review! Otherwise, n-grams from the four files above, which are preceded and followed by one empty space character, can never match an instance that is positioned at the beginning or at the end of a review.
Third, " subjpo " and " subjneg " contain one empty space character at the beginning and at the end, in order to prevent amalgamation. Indeed, what would happen if empty space characters were not added? Let’s take the example of extra, which shows in subj_pos_unigrams.csv and in training review 418 as extra room. If " extra " were replaced with just “subjpo” in review 418, then we would get in review 418 “subjporoom”, which would no longer be a generic positive unigram! Transformation would be useless if not annoyingly counterproductive!
Fourth, multiple inter word empty space characters have to be reduced to one single inter word empty space character: indeed, listed multigrams from the files mentioned above only have one empty space character between words and could never match multigrams from reviews with several empty space characters between words.
Fifth, in training reviews, negative multigrams have got to be replaced before positive multigrams. Let’s take the example of " not a good bargain “, which is a negatively polarized multigram from the file subj_neg_multigrams.csv: if matching with instances in reviews begins with positively polarized n-grams, then” not a good bargain " in a review becomes " not a subjpo " because " good bargain " is a positively polarized multigram from the file subj_pos_multigrams.csv. " not a subjpo " might be less clear in machine learning than " subjneg "! For similar reasons, positive multigrams are matched before negative unigrams and positive unigrams.
Sixth, negative or positive polarized multigrams from files mentioned above should be tentatively matched in decreasing order in for loops. Why? Let’s take the example of " no good bargain " in one review. In sub_neg_multigrams.csv, we have " no good bargain " and " no good “; if these are considered in decreasing order, then, in the review,” no good bargain " is replaced with " subjneg “, which looks appropriate; otherwise” no good bargain " is replaced with " subjneg bargain " and then " subjneg subjpo ": consequently, instead of having one negative generic unigram we would get one positive and one negative generic unigrams, which would be ambiguous!
Seventh, in order to further fine-tune text mining, combinations of negation (from the file negation.csv) and polarized n-grams from the four files above will be treated specifically: polarity will be flipped. Let’s take an example. Review 269 reads: Att is not clear, sound is very distorted and you have to yell when you talk. We know that clear shows in subj_pos_unigrams.csv. Applying the rules already stated, we should end up with " not subjpo ". With the new rule for combinations, we get " subjneg ", which is straightforward!
Eighth, the seventh rule will also be applied to any combination of short negative forms (from the file short_forms_neg.csv) and polarized n-grams from the four files above, mutatis mutandis.
The utf8 package will be used to normalize punctuation: indeed, there has been some trouble with curly apostrophes instead of straight apostrophes.
Now, it is time we reran NLP with the new rules!
# Helper: download a CSV of polarized n-grams (the
# second column holds the n-grams, the first is an
# index) and format each n-gram with exactly one
# leading and one trailing empty space character so
# that later matching cannot pick up partial substrings.
# For multigrams, entries are additionally sorted in
# decreasing order (so that longer multigrams are
# matched before the shorter ones they contain), curly
# apostrophes are normalized to straight ones, and
# str_squish() collapses repeated inter-word spaces.
# Unigrams only need str_trim(): they contain no
# apostrophes and no inter-word spaces.
read_polarized_ngrams <- function(url, multigram = FALSE) {
  ngrams <- read.csv(url, header = FALSE,
                     stringsAsFactors = FALSE)[, 2]
  if (multigram) {
    ngrams <- sort(ngrams, decreasing = TRUE)
    # vapply() instead of sapply(): type-stable even
    # on empty input.
    ngrams <- vapply(ngrams, utf8_normalize,
                     character(1), map_quote = TRUE)
    paste("", str_squish(ngrams), "")
  } else {
    paste("", str_trim(ngrams), "")
  }
}
# Base URL of the GitHub repository holding the
# four files of polarized n-grams.
repo <- "https://raw.githubusercontent.com/Dev-P-L/Sentiment-Analysis__Amazon-Reviews__Revisited/master"
# Downloading positive multigrams.
subj_pos_multigrams <-
  read_polarized_ngrams(paste0(repo, "/subj_pos_multigrams.csv"),
                        multigram = TRUE)
# Downloading positive unigrams (no UTF8 normalization
# needed: there is no apostrophe in positive unigrams).
subj_pos_unigrams <-
  read_polarized_ngrams(paste0(repo, "/subj_pos_unigrams.csv"))
# Downloading negative multigrams.
subj_neg_multigrams <-
  read_polarized_ngrams(paste0(repo, "/subj_neg_multigrams.csv"),
                        multigram = TRUE)
# Downloading negative unigrams.
subj_neg_unigrams <-
  read_polarized_ngrams(paste0(repo, "/subj_neg_unigrams.csv"))
rm(read_polarized_ngrams, repo)
# Create the training corpus and lowercase every review.
corpus_av2 <-
  VCorpus(VectorSource(reviews_training$text))
corpus_av2 <-
  tm_map(corpus_av2, content_transformer(tolower))
# Replace all punctuation marks by spaces except for apostrophes
# and hyphens: the inner gsub() spaces out every punctuation
# character other than - . ? ! '; the outer gsub() then spaces
# out the sentence terminators . ? ! as well, so only apostrophes
# and hyphens survive. seq_len() replaces 1:nrow(), which would
# misbehave on an empty data set; perl = TRUE spelled out.
for (i in seq_len(nrow(reviews_training))) {
  corpus_av2[[i]]$content <-
    gsub("[.?!]", " ",
         gsub("(?![-.?!'])[[:punct:]]", " ",
              corpus_av2[[i]]$content, perl = TRUE))
}
# Trim leading/trailing spaces so that apostrophes and hyphens
# located at the very beginning or end of a review reach the
# first or last character position.
for (i in seq_len(nrow(reviews_training))) {
  corpus_av2[[i]]$content <-
    str_trim(corpus_av2[[i]]$content)
}
# Strip any run of leading or trailing punctuation
# (apostrophes/hyphens, possibly repeated).
for (i in seq_len(nrow(reviews_training))) {
  corpus_av2[[i]]$content <-
    gsub("^[[:punct:]]+|[[:punct:]]+$", "",
         corpus_av2[[i]]$content)
}
# Normalize spacing: str_squish() collapses repeated inter-word
# spaces and removes leading/trailing ones; paste("", x, "")
# then re-adds exactly one leading and one trailing space —
# the space-wrapped substitution patterns below rely on these
# sentinel spaces for whole-word matching.
for (i in seq_len(nrow(reviews_training))) {
  corpus_av2[[i]]$content <-
    paste("", str_squish(corpus_av2[[i]]$content), "")
}
# Match multigrams from reviews with polarized multigrams from
# subj_neg_multigrams.csv / subj_pos_multigrams.csv; on a match,
# replace them with the generic polarized unigram " subjneg " /
# " subjpo ". fixed = TRUE matches each lexicon entry literally,
# so a regex metacharacter slipping into the lexicons cannot
# corrupt the substitution (the original used regex matching).
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_neg_multigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_neg_multigrams[j], " subjneg ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_pos_multigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_pos_multigrams[j], " subjpo ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
# Replace all non-intraword remaining apostrophes and all
# non-intraword remaining hyphens with a single space
# (intraword ones, e.g. in contractions or compounds, survive).
for (i in seq_len(nrow(reviews_training))) {
  corpus_av2[[i]]$content <-
    gsub("' | '| ' |- | -| - ", " ",
         corpus_av2[[i]]$content)
}
# Removing stopwords from remaining_stopwords.csv .
corpus_av2 <-
tm_map(corpus_av2, removeWords, stopwords_remaining)
# Stemming reviews.
corpus_av2 <- tm_map(corpus_av2, stemDocument)
# The function stemDocument does not only stem
# but also suppresses spaces at the beginning
# and at the end of each review (as well as all
# repeated empty space characters). Consequently,
# one space has to be added again at the beginning
# and at the end of each review — the space-wrapped
# substitution patterns used below depend on these
# sentinel spaces.
for (i in 1:nrow(reviews_training)) {
corpus_av2[[i]]$content <-
paste("", corpus_av2[[i]]$content, "")
}
# Removing numbers, digits.
corpus_av2 <- tm_map(corpus_av2, removeNumbers)
# Removing repeated empty space characters.
corpus_av2 <- tm_map(corpus_av2, stripWhitespace)
# Polarize multigrams a second time: stemming may have made some
# additional multigrams eligible (not highly probable, but not
# excluded). fixed = TRUE matches lexicon entries literally so
# regex metacharacters cannot corrupt the substitution;
# seq_len()/seq_along() keep the loops safe on empty inputs.
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_neg_multigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_neg_multigrams[j], " subjneg ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_pos_multigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_pos_multigrams[j], " subjpo ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
# Match polarized unigrams against review unigrams and replace
# matches with the corresponding generic polarized token.
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_neg_unigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_neg_unigrams[j], " subjneg ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(subj_pos_unigrams)) {
    corpus_av2[[i]]$content <-
      gsub(subj_pos_unigrams[j], " subjpo ",
           corpus_av2[[i]]$content, fixed = TRUE)
  }
}
# Flip polarity of any combination negation + " subjneg "
# into " subjpo ". paste0() replaces the original
# paste(..., sep = "") — identical output.
# NOTE(review): the pattern carries no leading space of its own;
# presumably each negation[] entry already starts with one so a
# match cannot begin mid-word — confirm against the definition
# of the negation list.
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(negation)) {
    corpus_av2[[i]]$content <-
      gsub(paste0(negation[j], " subjneg "),
           " subjpo ", corpus_av2[[i]]$content)
  }
}
# Flip polarity of any combination negative short form
# + " subjneg " into " subjpo ".
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(short_forms_neg)) {
    corpus_av2[[i]]$content <-
      gsub(paste0(short_forms_neg[j], " subjneg "),
           " subjpo ", corpus_av2[[i]]$content)
  }
}
# Flip polarity of any combination negation + " subjpo "
# into " subjneg ".
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(negation)) {
    corpus_av2[[i]]$content <-
      gsub(paste0(negation[j], " subjpo "),
           " subjneg ", corpus_av2[[i]]$content)
  }
}
# Flip polarity of any combination negative short form
# + " subjpo " into " subjneg ".
for (i in seq_len(nrow(reviews_training))) {
  for (j in seq_along(short_forms_neg)) {
    corpus_av2[[i]]$content <-
      gsub(paste0(short_forms_neg[j], " subjpo "),
           " subjneg ", corpus_av2[[i]]$content)
  }
}
# Removing negative and positive short forms and
# repeated empty space characters.
corpus_av2 <-
tm_map(corpus_av2, removeWords, short_forms_neg)
corpus_av2 <-
tm_map(corpus_av2, removeWords, short_forms_pos)
corpus_av2 <- tm_map(corpus_av2, stripWhitespace)
# Creating document term matrix, handling sparsity,
# converting to data frame, making column names
# R-friendly and adding independent variable.
# NOTE(review): removeSparseTerms(dtm, 0.995) drops terms
# whose document sparsity exceeds 99.5 % — i.e. keeps terms
# appearing in at least ~0.5 % of reviews; confirm threshold
# choice against tm documentation.
dtm_av2 <- DocumentTermMatrix(corpus_av2)
sparse_av2 <- removeSparseTerms(dtm_av2, 0.995)
sentSparse_av2 <-
as.data.frame(as.matrix(sparse_av2))
rownames(sentSparse_av2) <-
1:nrow(sentSparse_av2)
colnames(sentSparse_av2) <-
make.names(colnames(sentSparse_av2))
sentSparse_av2 <-
sentSparse_av2 %>%
mutate(sentiment = reviews_training$sentiment)
# Building a CART model with cp tuning.
# NOTE(review): no trControl is supplied, so caret's default
# resampling applies (presumably 25 bootstrap resamples —
# confirm); tuneLength = 15 evaluates 15 candidate cp values.
set.seed(1)
fit_cart_tuned_av2 <- train(sentiment ~ .,
method = "rpart",
data = sentSparse_av2,
tuneLength = 15,
metric = "Accuracy")
# Predicting on the training set (predict() with no newdata
# returns fitted values for the training data).
fitted_cart_tuned_av2 <- predict(fit_cart_tuned_av2)
# Producing the confusion matrix on the training set.
cm_cart_tuned_av2 <-
confusionMatrix(as.factor(fitted_cart_tuned_av2),
as.factor(sentSparse_av2$sentiment))
# One-cell table with the training-set accuracy level.
tab <-
data.frame(cm_cart_tuned_av2$overall["Accuracy"]) %>%
`rownames<-`("Model: Negation + Polarization + CART + Tuning") %>%
`colnames<-`("Accuracy on the Training Set")
# Layout of table and printing
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = greenish_blue) %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue)| Accuracy on the Training Set | |
|---|---|
| Model: Negation + Polarization + CART + Tuning | 0.9251497 |
Polarizing n-grams that convey subjective information has dramatically boosted accuracy from approximately 79 % to 93 %. Nevertheless, let us stay realistic: text mining has been performed on the training set and is not necessarily, or at least not necessarily entirely, transposable to the validation set.
To get some visual representation of changes, let’s have a look at a wordcloud from the new bag of words.
# Drop the dependent variable (last column) so only token
# counts remain.
df <- sentSparse_av2 %>% select(-ncol(sentSparse_av2))
# Token/frequency table: tokens occurring at least 10 times,
# most frequent first, capped at the top 40.
df <- data.frame(word = colnames(df),
                 freq = colSums(df)) %>%
  filter(freq >= 10) %>%
  arrange(desc(freq)) %>%
  head(40)
# Second, let's create the wordcloud.
set.seed(1)
wordcloud2(df, shape = 'square',
color = 'random-light',
backgroundColor = "blue",
shuffle = FALSE)The whole picture has changed. The predominant tokens are now subjpo, with 598 instances, and subjneg, with 220 instances. not* is present, at the fourth level, with 51 instances, just after phone, with 116 instances.
The following histogram adds some quantitative insight.
# Retrieve the new bag of words (drop the dependent variable,
# i.e. the last column).
df <- sentSparse_av2 %>% select(- ncol(.))
# Data frame with the 12 most frequent tokens.
freq <-
  data.frame(to = colnames(df),
             fre = as.integer(colSums(df)),
             stringsAsFactors = FALSE) %>%
  arrange(desc(fre)) %>%
  head(12)
# Build the histogram. geom_col() replaces
# geom_bar(stat = "identity") — identical output,
# clearer intent.
graph <- freq %>% mutate(to = reorder(to, fre)) %>%
  ggplot(aes(to, fre)) +
  geom_col(width = 0.80,
           color = "#007ba7", fill = "#007ba7") +
  coord_flip() +
  ggtitle("Token Frequency") +
  xlab("Token") + ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = 16,
                                  face = "bold"),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.x = element_text(angle = 45, hjust = 1,
                                   size = 12),
        axis.text.y = element_text(size = 12))
# Make the graph interactive.
p <- ggplotly(graph, dynamicTicks = TRUE,
              width = 500, height = 500)
# Centering the graph, because the centering
# opts_chunk previously inserted is not operative
# in the case of the ggplotly() function.
htmltools::div(p, align = "center")Is there any parallel evolution in the decision tree?
palette <- c(super_light_gray, super_light_taupe)
prp(fit_cart_tuned_av2$finalModel,
uniform = TRUE, cex = 1,
box.palette = palette, border.col = "white")Yes, the least we can say is that indeed there is now more parallelism between wordcloud/histogram and decision tree! The two predominant tokens in the wordcloud are now the only decision nodes in the decision tree. The first node is occupied by subjpo; then comes subjneg. Many individual tokens that were previously in nodes of the tree, have disappeared. Even not and but, which show up in the wordcloud, have faded away. Maybe a more complex machine learning model could still marginally retrieve additional information from tokens other than subjpo and subjneg. This will be experimented in the section Machine Learning (ML).
The next table summarizes accuracy results obtained so far.
# Performance metric names
colname <- c("MODEL ID", "SHORT DESCRIPTION",
             "ACCURACY", "SENSITIVITY", "NEG PRED VAL",
             "SPECIFICITY", "POS PRED VAL")
# Model names
models <- c("cart_tuned_av1_a", "cart_tuned_av1_b",
            "cart_tuned_av1_c", "cart_tuned_av2")
# Model descriptions
description <-
  c("CART + tuning + negation",
    "CART + tuning + negation + neg short forms",
    'CART + tuning + negation + neg short forms = "not"',
    "CART + tuning + negation + polarization")
# Names of the confusion-matrix objects built in earlier chunks.
cm <- c("cm_cart_tuned_av1_a", "cm_cart_tuned_av1_b",
        "cm_cart_tuned_av1_c", "cm_cart_tuned_av2")
# Receptacle data frame for performance metrics.
tab <-
  data.frame(matrix(1:(length(colname) * length(models)),
                    ncol = length(colname),
                    nrow = length(models)) * 1)
# Fill in the receptacle with performance metrics.
# get() retrieves each confusion matrix by name — safer and
# clearer than the original eval(parse(text = ...)).
for (i in seq_along(models)) {
  tab[i, 1] <- models[i]
  tab[i, 2] <- description[i]
  tab[i, 3] <- get(cm[i])$overall["Accuracy"]
  tab[i, 4] <- get(cm[i])$byClass["Sensitivity"]
  tab[i, 5] <- get(cm[i])$byClass["Neg Pred Value"]
  tab[i, 6] <- get(cm[i])$byClass["Specificity"]
  tab[i, 7] <- get(cm[i])$byClass["Pos Pred Value"]
}
# Round performance metrics and add column names.
# list(~ ...) replaces the funs() syntax deprecated
# since dplyr 0.8.
tab_av_1_2 <-
  tab %>% mutate_at(vars(3:7), list(~ round(., 4))) %>%
  `colnames<-`(colname)
# Recall the previous table and regularize its colnames.
tab <-
  tab_av0 %>%
  `colnames<-`(colname)
# Stack the two tables into a global one.
tab_av_0_1_2 <- rbind(tab, tab_av_1_2)
knitr::kable(tab_av_0_1_2, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
row_spec(c(1, 5:6), bold = T, strikeout = T,
color = "white",
background = harvard_crimson) %>%
row_spec(2:4, bold = T, color = greenish_blue,
background = super_light_taupe) %>%
row_spec(7, bold = T, color = "white",
background = greenish_blue)| MODEL ID | SHORT DESCRIPTION | ACCURACY | SENSITIVITY | NEG PRED VAL | SPECIFICITY | POS PRED VAL |
|---|---|---|---|---|---|---|
| baseline | Baseline Model | 0.5000 | 1.0000 | Div. by 0 | 0.0000 | 0.5000 |
| cart_av0 | CART | 0.7680 | 0.6407 | 0.7136 | 0.8952 | 0.8594 |
| cart_tuned_av0 | CART + Tuning | 0.7874 | 0.7365 | 0.7609 | 0.8383 | 0.8200 |
| cart_tuned_av1_a | CART + tuning + negation | 0.7949 | 0.7305 | 0.7613 | 0.8593 | 0.8385 |
| cart_tuned_av1_b | CART + tuning + negation + neg short forms | 0.7605 | 0.5749 | 0.69 | 0.9461 | 0.9143 |
| cart_tuned_av1_c | CART + tuning + negation + neg short forms = “not” | 0.7874 | 0.7126 | 0.75 | 0.8623 | 0.8380 |
| cart_tuned_av2 | CART + tuning + negation + polarization | 0.9251 | 0.9431 | 0.941 | 0.9072 | 0.9104 |
In the table above, on rows 1, 5 and 6, fonts have been stricken through to indicate that these models have been discarded for lack of performance.
On the other rows, i.e. rows 2 to 4 and 7, the models should be seen as a cumulative process bringing accuracy improvement in a stepwise and incremental way.
In particular, as shown by comparing rows 4 and 7, polarization allowed accuracy to jump from 79 % up to 93 %, which is impressive.
More impressive: sensitivity has sprung from 73 % to 94 %. This is linked to false negative management. False negatives have been a recurrent weak point in machine learning results up to now. But special attention has been paid to them in debriefing previous machine learning results and in perusing false negatives and false positives.
Let’s have a look at the numbers of remaining false negatives and positives in the following confusion matrix.
# Raw 2x2 confusion counts on the training set, flattened
# column-major and rebuilt as a labelled data frame.
# NOTE(review): the "Actually positive"/"Actually negative"
# labels assume a specific ordering of the sentiment factor
# levels in table(); confirm the level order matches the labels.
tab <-
table(fitted_cart_tuned_av2, sentSparse_av2$sentiment) %>%
as.vector()
tab <-
data.frame(matrix(tab, ncol = 2, nrow = 2, byrow = FALSE)) %>%
`colnames<-`(c("Actually positive", "Actually negative")) %>%
`rownames<-`(c("Predicted positive AFTER POLARIZING",
"Predicted negative AFTER POLARIZING"))
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
column_spec(1, bold = T, color = "black") %>%
column_spec(2, bold = T, color = "white",
background = greenish_blue) %>%
column_spec(3, bold = T, color = "white",
background = harvard_crimson) | Actually positive | Actually negative | |
|---|---|---|
| Predicted positive AFTER POLARIZING | 315 | 31 |
| Predicted negative AFTER POLARIZING | 19 | 303 |
# sentSparse_av2 will be preserved for further use in the part about machine learning.
# Free memory: drop the intermediate objects of the av1_a /
# av1_b / av1_c experiments and of the av2 build.
rm(reviews_training)
rm(corpus_av1_a, dtm_av1_a, sparse_av1_a)
rm(fit_cart_tuned_av1_a, fitted_cart_tuned_av1_a, cm_cart_tuned_av1_a)
rm(corpus_av1_b, dtm_av1_b, sparse_av1_b, sentSparse_av1_b)
rm(fit_cart_tuned_av1_b, fitted_cart_tuned_av1_b, cm_cart_tuned_av1_b)
rm(corpus_av1_c, dtm_av1_c, sparse_av1_c, sentSparse_av1_c)
rm(fit_cart_tuned_av1_c, fitted_cart_tuned_av1_c, cm_cart_tuned_av1_c)
rm(corpus_av2, dtm_av2, sparse_av2)
rm(fitted_cart_tuned_av2, cm_cart_tuned_av2)
rm(df, freq, i, j, colname, tab, tab_av_0_1_2)
# fit_cart_tuned_av2 is kept for further use.With respect to the model before Text Mining, Text Mining has crushed the number of false negatives from 88 to 19. There is also a decrease in false positives, much more modest though, from 54 to 31.
After improving accuracy thanks to NLP and text mining, new accuracy improvements will be looked for in the next section through machine learning optimization.
Two models are going to be applied.
Two machine learning models have been chosen: CART and eXtreme Gradient Boosting Tree.
CART has been our yardstick up to now; it will help evaluate the performance from XGBoost Tree. It will also be run on the validation set, where it will once again allow to better evaluate the performance of the other model.
XGBoost had emerged among ten models as the best-performing one in a previous version of this project — see the Executive Summary. Without further ado, XGBoost will be taken again; however, the previous version used XGBoost Linear, whereas this time XGBoost Tree is used, based on reports in the machine-learning literature.
Here is some model signage.
# Signage of the two models, i.e. CART and XGBoost Tree
IDs <- c("cart_15", "xgbTree_5")
models <- c("CART rpart",
            "eXtreme Gradient Boosting Tree")
caret_names <- c("rpart", "xgbTree")
tunings <- c(15, 5)
nr_resamples <- rep(25, 2)
# Column headers of the model signage table
colname_methods <-
  c("MODEL ID", "MODEL", "NAME IN CARET",
    "# TUNING VALUES", "# BOOTSTRAPPED RESAMPLES")
# Presentation table of models signage
tab <- data.frame(IDs, models, caret_names,
                  tunings, nr_resamples)
colnames(tab) <- colname_methods
# Layout of the table and printing
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
row_spec(1:2, bold = T, color = "white",
background = greenish_blue) | MODEL ID | MODEL | NAME IN CARET | # TUNING VALUES | # BOOTSTRAPPED RESAMPLES |
|---|---|---|---|---|
| cart_15 | CART rpart | rpart | 15 | 25 |
| xgbTree_5 | eXtreme Gradient Boosting Tree | xgbTree | 5 | 25 |
Both models will be trained with the train() function from the package caret.
CART will be tuned on 15 values of the cp parameter, on 25 bootstrapped resamples for each cp value. XGBoost Tree will be tuned with tuneLength = 5, i.e. on a caret-generated grid of candidate combinations of its tuning parameters, with 25 bootstrapped resamples for each combination.
The names of the parameters tuned by the train() function on XGBoost Tree are available in http://topepo.github.io/caret/available-models.html .
The accuracy results are summarized in the next table.
# Reinstate the existing, saved training set sentSparse_av2
# under the shorter name "train".
train <- sentSparse_av2
rm(sentSparse_av2)
# Preallocated list receiving the fitted models (the original
# grew a list(1) by assignment — identical end state).
fits <- vector("list", 2)
# rpart has already been run and will not be rerun.
fits[[1]] <- fit_cart_tuned_av2
# Run XGBoost Tree. NOTE(review): calling caret's train() while
# a data frame named "train" exists works because R skips
# non-function objects when resolving a function call.
set.seed(1)
fits[[2]] <- train(sentiment ~ .,
                   method = caret_names[2],
                   data = train,
                   tuneLength = 5,
                   metric = "Accuracy")
# Name the result bulk.
names(fits) <- IDs
# Training-set predictions, one column per model
# (replaces the original preallocate-and-loop fill).
df <- data.frame(lapply(fits, predict))
# Training-set accuracy for each model.
tab <- data.frame(
  acc = vapply(seq_along(fits),
               function(k) mean(df[, k] == train$sentiment),
               numeric(1)))
# Column names for the result table.
colname_results <-
  c("MODEL ID", "MODEL", "# TUNING VALUES",
    "# BOOTSTRAPPED RESAMPLES",
    "ACCURACY ON THE TRAINING SET")
# Build the result table, best accuracy first.
tab <- tab %>%
  mutate(acc = round(acc, 4)) %>%
  mutate(ID = IDs) %>%
  mutate(mod = models) %>%
  mutate(tuning = tunings) %>%
  mutate(boot = nr_resamples) %>%
  select(ID, mod, tuning, boot, acc) %>%
  arrange(desc(acc)) %>%
  `colnames<-`(colname_results)
# Layout of the table and printing
knitr::kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
row_spec(1, bold = T, color = "white",
background = greenish_blue) %>%
row_spec(2, bold = T, color = "white",
background = harvard_crimson)| MODEL ID | MODEL | # TUNING VALUES | # BOOTSTRAPPED RESAMPLES | ACCURACY ON THE TRAINING SET |
|---|---|---|---|---|
| xgbTree_5 | eXtreme Gradient Boosting Tree | 5 | 25 | 0.9296 |
| cart_15 | CART rpart | 15 | 25 | 0.9251 |
Not surprisingly, XGBoost Tree delivers better results than CART but in a very marginal measure.
Better results are not surprising when taking XGBoost Tree reputation into account.
The narrow margin between the two models is not surprising either: indeed, focus has been put on Text Mining, a lot of work has been done and information has already been largely canalized through polarization.
Let’s run both models on the validation set.
The validation set has to be constructed in the same way as the training set.
In order to predict on the validation set, the training set and the validation set have to contain the same columns, i.e. the same labels (dependent variable) and the same predictors. For labels, there is no problem. Let’s see about predictors.
The training set already exists: it is sentSparse_av2, which has been built up through NLP and with Text Mining insights. The training set cannot be changed to match the validation set, since this would imply, in one way or another, retrieving some information from the validation set and instilling it into the training set, which is contrary to the very status of a validation set.
The validation set will be constructed in the same way as the training set through NLP and Text Mining. Columns from the validation set will be aligned on the training set, of course not in content but in headers, in column names (i.e. in tokens). Columns that are in the validation set but not in the training set will be removed. Columns that are in the training set but not in the validation set will be added as null vectors to the validation set.
# Retrieving validation reviews.
# NOTE(review): this pipeline mirrors the training-set NLP
# pipeline step by step; any change made there must be
# replicated here (and vice versa) so both sets are
# tokenized identically.
reviews_val <- reviews[ind_val, ]
# Creating and lowercasing corpus.
corpus <-
VCorpus(VectorSource(reviews_val$text))
corpus <-
tm_map(corpus, content_transformer(tolower))
# Replacing all punctuation marks by empty space
# characters except for apostrophes and hyphens:
# the inner gsub() spaces out every punctuation character
# other than - . ? ! '; the outer gsub() then spaces out
# the sentence terminators . ? ! as well.
for (i in 1:nrow(reviews_val)) {
corpus[[i]]$content <-
gsub("[.?!]", " ",
gsub("(?![-.?!'])[[:punct:]]", " ",
corpus[[i]]$content, perl = T))
}
# Removing empty space characters at the beginning
# and at the end of reviews to get apostrophes and
# hyphens in first or last position if they are
# at the beginning or at the end of a review.
for (i in 1:nrow(reviews_val)) {
corpus[[i]]$content <-
str_trim(corpus[[i]]$content)
}
# Removing apostrophes and hyphens at the beginning
# and at the end of reviews, with repetition
# (in case there are several of them).
for (i in 1:nrow(reviews_val)) {
corpus[[i]]$content <-
gsub("^[[:punct:]]+|[[:punct:]]+$","",
corpus[[i]]$content)
}
# Making sure there is one single leading empty space
# character, one single trailing empty space character
# and one single inter-word empty space character
# between two words. The str_squish() function removes
# leading and trailing space and avoids repeated
# inter word space; the function paste(), with pairs
# of quotes as arguments, adds one single leading empty
# space character and one single trailing empty space
# character — sentinel spaces the substitution patterns
# below rely on.
for (i in 1:nrow(reviews_val)) {
corpus[[i]]$content <-
paste("", str_squish(corpus[[i]]$content), "")
}
# Polarize review multigrams by substitution.
# fixed = TRUE matches each lexicon entry literally so regex
# metacharacters cannot corrupt the substitution;
# seq_len()/seq_along() keep the loops safe on empty inputs.
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_neg_multigrams)) {
    corpus[[i]]$content <-
      gsub(subj_neg_multigrams[j], " subjneg ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_pos_multigrams)) {
    corpus[[i]]$content <-
      gsub(subj_pos_multigrams[j], " subjpo ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
# Replace all non-intraword remaining apostrophes and all
# non-intraword remaining hyphens with a single space.
for (i in seq_len(nrow(reviews_val))) {
  corpus[[i]]$content <-
    gsub("' | '| ' |- | -| - ", " ",
         corpus[[i]]$content)
}
# Removing stopwords stored in remaining_stopwords.csv .
corpus <-
tm_map(corpus, removeWords, stopwords_remaining)
# Stemming reviews.
corpus <- tm_map(corpus, stemDocument)
# The function stemDocument does not only stem
# but also suppresses empty space characters
# at the beginning and at the end of each review
# (as well as all repeated empty space characters).
# Consequently, one empty space character has to be
# added again at the beginning and at the end of each review
# so the space-wrapped substitution patterns keep matching.
for (i in 1:nrow(reviews_val)) {
corpus[[i]]$content <-
paste("", corpus[[i]]$content, "")
}
# Removing numbers, digits.
corpus <- tm_map(corpus, removeNumbers)
# Removing repeated empty space characters.
corpus <- tm_map(corpus, stripWhitespace)
# Polarize multigrams again after stemming: some multigrams may
# have become eligible only after stemming (not highly probable,
# but not impossible either). fixed = TRUE matches lexicon
# entries literally; seq idioms keep loops safe on empty inputs.
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_neg_multigrams)) {
    corpus[[i]]$content <-
      gsub(subj_neg_multigrams[j], " subjneg ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_pos_multigrams)) {
    corpus[[i]]$content <-
      gsub(subj_pos_multigrams[j], " subjpo ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
# Polarize unigrams by substitution.
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_neg_unigrams)) {
    corpus[[i]]$content <-
      gsub(subj_neg_unigrams[j], " subjneg ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(subj_pos_unigrams)) {
    corpus[[i]]$content <-
      gsub(subj_pos_unigrams[j], " subjpo ",
           corpus[[i]]$content, fixed = TRUE)
  }
}
# Flip polarity of any combination negation + " subjneg "
# into " subjpo ". paste0() replaces paste(..., sep = "") —
# identical output. NOTE(review): the pattern has no leading
# space of its own; presumably each negation[] entry already
# starts with one — confirm against the negation list.
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(negation)) {
    corpus[[i]]$content <-
      gsub(paste0(negation[j], " subjneg "),
           " subjpo ", corpus[[i]]$content)
  }
}
# Flip polarity of any combination negative short form
# + " subjneg " into " subjpo ".
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(short_forms_neg)) {
    corpus[[i]]$content <-
      gsub(paste0(short_forms_neg[j], " subjneg "),
           " subjpo ", corpus[[i]]$content)
  }
}
# Flip polarity of any combination negation + " subjpo "
# into " subjneg ".
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(negation)) {
    corpus[[i]]$content <-
      gsub(paste0(negation[j], " subjpo "),
           " subjneg ", corpus[[i]]$content)
  }
}
# Flip polarity of any combination negative short form
# + " subjpo " into " subjneg ".
for (i in seq_len(nrow(reviews_val))) {
  for (j in seq_along(short_forms_neg)) {
    corpus[[i]]$content <-
      gsub(paste0(short_forms_neg[j], " subjpo "),
           " subjneg ", corpus[[i]]$content)
  }
}
# Remove negative and positive short forms, then collapse
# repeated empty space characters.
corpus <- tm_map(corpus, removeWords, short_forms_neg)
corpus <- tm_map(corpus, removeWords, short_forms_pos)
corpus <- tm_map(corpus, stripWhitespace)
# Create the Document Term Matrix for the validation corpus.
dtm <- DocumentTermMatrix(corpus)
# No sparsity threshold is applied, since it could discard
# tokens present in the separately built training set.
# Convert to data frame, make colnames R-friendly and add
# the dependent variable.
sentSparse <- as.data.frame(as.matrix(dtm))
colnames(sentSparse) <-
  make.names(colnames(sentSparse))
sentSparse <- sentSparse %>%
  mutate(sentiment = reviews_val$sentiment) %>%
  as.data.frame()
val <- sentSparse
# For machine learning, columns have to match between training
# set and validation set. Keep only columns that also exist in
# the training set ("sentiment" survives, being in both).
val <- val %>%
  as.data.frame() %>%
  select(intersect(colnames(.), colnames(train)))
# Training-set columns missing from "val" are added as zero
# vectors. matrix(0, ...) builds the zero block directly —
# the original filled the matrix with a scalar and then
# multiplied by zero, which only worked by accident.
mis <- setdiff(colnames(train), colnames(val))
df <-
  data.frame(matrix(0,
                    nrow = nrow(val),
                    ncol = length(mis))) %>%
  `colnames<-`(mis)
val <- cbind(val, df) %>%
  as.data.frame()
rm(subj_neg_multigrams, subj_neg_unigrams,
   subj_pos_multigrams, subj_pos_unigrams)
rm(dtm, sentSparse)
rm(i, j)Models can now be validated on the validation set: predictions will be computed on the validation set and then accuracy.
# Predictions on the validation set with XGBoost Tree
# and CART for further use
pred_xgbTree_5 <-
predict(fits$xgbTree_5, newdata = val)
pred_cart_15 <-
predict(fits$cart_15, newdata = val)
# Accuracy on the validation set with XGBoost Tree and CART
acc_xgbTree_5 <-
round(mean(pred_xgbTree_5 == val$sentiment), 4)
acc_cart_15 <-
round(mean(pred_cart_15 == val$sentiment), 4)
# Accuracy on the validation set with baseline model
# (always predicting the positive label " Pos" — note the
# leading space carried by the polarity labels).
# NOTE(review): "ref" is not used in this chunk; presumably
# kept for a later chunk — confirm before removing.
ref <- as.character(val$sentiment)
pred_baseline <-
data.frame(sentiment = rep(" Pos", nrow(val)),
stringsAsFactors = FALSE)
# sprintf() makes acc_baseline a character string so the table
# shows four decimals; combining it with the numeric accuracies
# below coerces all three cells to character (display only).
acc_baseline <-
sprintf("%.4f",
round(mean(pred_baseline$sentiment == val$sentiment), 4))
# Table of accuracy on the validation set
# with baseline model, CART and XGBoost Tree.
tab <-
data.frame(matrix(c(acc_baseline, acc_cart_15, acc_xgbTree_5),
nrow = 3, ncol = 1)) %>%
`colnames<-`("ACCURACY ON THE VALIDATION SET") %>%
`rownames<-`(c("Baseline Model",
"NLP + Text Mining + CART rpart + 15 Tuning Iterations",
"NLP + Text Mining + XGBoost Tree + 5 Tuning Iterations"))
kable(tab, "html", align = "c") %>%
kable_styling(bootstrap_options = "bordered",
full_width = F, font_size = 16) %>%
row_spec(1, bold = T, color = "white",
background = harvard_crimson) %>%
row_spec(2, bold = T, color = greenish_blue,
background = super_light_taupe) %>%
row_spec(3, bold = T, color = "white",
background = greenish_blue)| ACCURACY ON THE VALIDATION SET | |
|---|---|
| Baseline Model | 0.5000 |
| NLP + Text Mining + CART rpart + 15 Tuning Iterations | 0.9036 |
| NLP + Text Mining + XGBoost Tree + 5 Tuning Iterations | 0.9096 |
XGBoost Tree delivers an accuracy level of 91 % on the validation set, against 90 % with CART. Actually, the difference is even smaller than one percentage point.
For prospective purposes, false negatives and false positives generated by XGBoost Tree are available in two tables below. Some of them are probably inevitable but showing the prediction limits could be the starting point of a new project, on another data set.
# Identify false negatives: actual polarity positive (" Pos",
# with its leading space) while predicted polarity is not.
# Actual and predicted polarity are combined in one data frame.
df <- data.frame(sentiment = reviews_val$sentiment,
                 pred = pred_xgbTree_5)
# The difference below yields 1 for false negatives, -1 for
# false positives and 0 otherwise (true positive or negative).
FN_val <- ifelse(df$sentiment == " Pos", 1, 0) -
  ifelse(df$pred == " Pos", 1, 0)
# Dichotomize: 1 for false negatives, 0 for everything else.
FN_val <- ifelse(FN_val == 1, 1, 0)
# Row numbers corresponding to false negatives
FN <- which(FN_val == 1)
# Receptacle data frame for the interactive table of all
# false negatives delivered by XGB. seq_along(FN) replaces
# 1:length(FN), which would yield c(1, 0) and break
# data.frame() if there were no false negatives at all.
df_fn <- data.frame(row = FN,
                    review = as.character(seq_along(FN)),
                    tokenized = as.character(seq_along(FN))) %>%
  `colnames<-`(c("Row",
                 "Validation Review Leading to a False Negative with XGB",
                 "Tokenized"))
# Populate the receptacle: row number, raw validation review
# and its tokenized form.
for (i in seq_along(FN)) {
  row <- FN[i]
  df_fn[i, 2] <- reviews_val$text[row]
  df_fn[i, 3] <- corpus[[row]]$content
}
rm(i, row)
# Convert row numbers to characters in order to have them
# left-aligned in the interactive data table below.
df_fn <- df_fn %>%
  mutate(Row = as.character(Row))
# Creating the interactive data table, using the DT package.
datatable(df_fn, rownames = FALSE, filter = "top",
options = list(pageLength = 5, scrollX = T,
# Sets background color and font color in header.
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({
'background-color': '#319b54',
'color': 'white'});",
"}"),
# Sets background color in rows.
rowCallback = JS(
'function(row, data, index, rowId) {',
'console.log(rowId)',
'if(rowId >= 0) {',
'row.style.backgroundColor = "#8adaa5";','}',
'}')
)
)Now, let’s have a look at false positives generated by XGBoost Tree on the validation set.
# To identify false positives, we need both
# the actual review polarity and the predicted
# review polarity. Consequently, we are going
# to combine both variables in one data frame.
df <- data.frame(sentiment = reviews_val$sentiment,
                 pred = pred_xgbTree_5)
# A review is a false positive exactly when actual polarity is
# negative and predicted polarity is not; this single condition
# replaces the original subtraction-then-ifelse dichotomization
# and yields the same 0/1 indicator vector.
# trimws() keeps the comparison robust to stray whitespace in the
# factor labels, mirroring the false-negative chunk, which compared
# against " Pos" with a leading space — confirm against the levels
# of reviews_val$sentiment.
FP_val <- ifelse(trimws(df$sentiment) == "Neg" &
                   trimws(df$pred) != "Neg", 1, 0)
# Row numbers corresponding to false positives.
FP <- which(FP_val == 1)
# Now let's build up an interactive table with
# all false positives delivered by XGB.
# Receptacle data frame: one row per false positive.
df_fp <-
  data.frame(row = FP,
             review = as.character(seq_along(FP)),
             tokenized = as.character(seq_along(FP))) %>%
  `colnames<-`(c("Row",
                 "Validation Review Leading to a False Positive",
                 "Tokenized"))
# Populate the receptacle data frame with a for loop gathering,
# for each false positive, the raw validation review and its
# tokenized form. seq_along() is safe when FP is empty,
# whereas 1:length(FP) would iterate over c(1, 0).
for (i in seq_along(FP)) {
  row <- FP[i]
  df_fp[i, 2] <- reviews_val$text[row]
  df_fp[i, 3] <- corpus[[row]]$content
}
rm(i, row)
# Converting row numbers to characters in order
# to have them left-aligned in the interactive
# data table below.
df_fp <- df_fp %>%
  mutate(Row = as.character(Row))
# Creating the interactive data table, using the DT package.
# (A leftover debug console.log() call has been removed from the
# rowCallback JavaScript.)
datatable(df_fp, rownames = FALSE, filter = "top",
          options = list(pageLength = 4, scrollX = TRUE,
                         # Sets background color and font color in header.
                         initComplete = JS(
                           "function(settings, json) {",
                           "$(this.api().table().header()).css({
                           'background-color': '#319b54',
                           'color': 'white'});",
                           "}"),
                         # Sets background color in rows.
                         rowCallback = JS(
                           'function(row, data, index, rowId) {',
                           'if(rowId >= 0) {',
                           'row.style.backgroundColor = "#8adaa5";', '}',
                           '}')
          )
)On the validation set, xgb_05 provides an accuracy level of 88 %, which is substantially higher than the 50 % accuracy level from a baseline model.
On the validation set, the baseline delivers an accuracy level of 50 % (just as on the training set). On the validation set again, the final model provides accuracy of 88 %, which is substantially higher.
Where does the improvement come from? From which step in the whole process: NLP, text mining or machine learning optimization? Splitting contribution to results needs to be done on the validation set since results on the training set can be boosted by overfitting. To evaluate input from each layer, a simple approach will be followed.
On the validation set without polarization and without negation, prediction will be conducted and accuracy produced. The method will be cart_15, i.e. CART with 15-value tuning and 25 bootstrapped resamples: this model is rather fast and can be considered as a yardstick, having proved resilient although not optimal. This will help evaluate the impact of NLP.
Then, on the validation set with negation, a second evaluation will be conducted with cart_15: this will help evaluate the impact of integrating negation.
On the validation set with negation and polarization, a third evaluation will be conducted with cart_15: this will help evaluate the impact of polarization.
The accuracy difference between this last result and the result from eXtreme Gradient Boosting will measure the impact of machine learning optimization.
88 % prediction accuracy has been reached on the validation set, against 50 % with a baseline model. Which factors have contributed to that improvement of 38 percentage points?
Natural Language Processing has contributed 21.7 percentage points.
Text mining has brought an additional accuracy improvement of 12.7 percentage points.
Machine learning optimization has boosted accuracy by 3.6 additional percentage points.
In this sentiment analysis project, a three-tier approach has lifted accuracy from a baseline of 50 % to 88 %: NLP (22 %), text mining (13 %) and machine learning optimization (4 %).
The Executive Summary, at the very beginning of this document, provides a nice overview. A dynamic table of content allows easy access to more detailed information.
In particular, the main insights from text mining can be found in “VI. INFORMATION RETRIEVAL USING INSIGHTS, C. Polarization - Text Classification - Text Substitution”. Instead of using existing dictionaries, customized lists of polarized tokens have been established from perusing unused subjective information available in false negatives and false positives. In reviews, instances matching these polarized tokens have been replaced by a generic token either positive or negative, boosting use of subjective information.
This method has proved rather resilient. With insights limited to the training set, it has brought a 10 % accuracy improvement on the validation set, i.e. almost as much as the 11 % accuracy increase on the training set.
Machine learning optimization has been conducted across ten models. eXtreme Gradient Boosting has emerged as the best-performing model in this project and has boosted accuracy by 4 additional percentage points. Testing has been performed on bootstrapped accuracy distributions.
Dear Readers,
Thank you for reaching the end. Please don’t hesitate to get in touch with me through my GitHub email address. I am interested in all kinds of comments.
Availability has been checked up on March 31, 2021.
https://www.edx.org/course/the-analytics-edge
https://www.tidytextmining.com/sentiment.html
https://medium.com/@annabiancajones/sentiment-analysis-of-reviews-text-pre-processing-6359343784fb
https://cran.r-project.org/web/packages/SentimentAnalysis/vignettes/SentimentAnalysis.html
https://monkeylearn.com/sentiment-analysis/
https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386
Distribution of accuracy from resamples
https://books.google.be/books?id=GgmqDwAAQBAJ&pg=PA80&lpg=PA80&dq#v=onepage&q&f=false
https://www.edx.org/course/data-science-machine-learning
https://holtzy.github.io/Pimp-my-rmd/
https://rstudio.github.io/DT/options
https://rstudio.github.io/DT/010-style.html
https://stackoverflow.com/questions/46853567/centering-plotly-output-to-html
https://stackoverflow.com/questions/25646333/code-chunk-font-size-in-rmarkdown-with-knitr-and-latex
https://bookdown.org/yihui/rmarkdown-cookbook/chunk-styling.html
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C
## [5] LC_TIME=French_France.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.16 htmltools_0.5.0 plotly_4.9.3 devtools_2.3.2
## [5] usethis_2.0.0 utf8_1.1.4 gridExtra_2.3 kableExtra_1.3.1
## [9] xgboost_1.2.0.1 caret_6.0-86 lattice_0.20-41 rpart.plot_3.0.9
## [13] rpart_4.1-15 caTools_1.18.0 RColorBrewer_1.1-2 wordcloud2_0.2.3
## [17] e1071_1.7-4 SnowballC_0.7.0 tm_0.7-8 NLP_0.2-1
## [21] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [25] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 ggplot2_3.3.3
## [29] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17
## [4] rprojroot_2.0.2 fs_1.5.0 rstudioapi_0.13
## [7] remotes_2.2.0 prodlim_2019.11.13 fansi_0.4.1
## [10] lubridate_1.7.9.2 xml2_1.3.2 codetools_0.2-16
## [13] splines_4.0.3 knitr_1.30 pkgload_1.1.0
## [16] jsonlite_1.7.2 pROC_1.16.2 broom_0.7.3
## [19] dbplyr_2.0.0 compiler_4.0.3 httr_1.4.2
## [22] backports_1.2.0 lazyeval_0.2.2 assertthat_0.2.1
## [25] Matrix_1.2-18 cli_2.2.0 prettyunits_1.1.1
## [28] tools_4.0.3 gtable_0.3.0 glue_1.4.2
## [31] reshape2_1.4.4 Rcpp_1.0.5 slam_0.1-48
## [34] cellranger_1.1.0 vctrs_0.3.6 nlme_3.1-149
## [37] crosstalk_1.1.0.1 iterators_1.0.13 timeDate_3043.102
## [40] gower_0.2.2 xfun_0.19 ps_1.5.0
## [43] testthat_3.0.1 rvest_0.3.6 lifecycle_0.2.0
## [46] MASS_7.3-53 scales_1.1.1 ipred_0.9-9
## [49] hms_0.5.3 parallel_4.0.3 curl_4.3
## [52] yaml_2.2.1 memoise_1.1.0 stringi_1.5.3
## [55] highr_0.8 desc_1.2.0 foreach_1.5.1
## [58] pkgbuild_1.2.0 lava_1.6.8.1 rlang_0.4.10
## [61] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [64] labeling_0.4.2 recipes_0.1.15 htmlwidgets_1.5.3
## [67] tidyselect_1.1.0 processx_3.4.5 plyr_1.8.6
## [70] magrittr_2.0.1 R6_2.5.0 generics_0.1.0
## [73] DBI_1.1.0 pillar_1.4.7 haven_2.3.1
## [76] withr_2.3.0 survival_3.2-7 nnet_7.3-14
## [79] modelr_0.1.8 crayon_1.3.4 rmarkdown_2.6
## [82] grid_4.0.3 readxl_1.3.1 data.table_1.13.4
## [85] callr_3.5.1 ModelMetrics_1.2.2.2 reprex_0.3.0
## [88] digest_0.6.27 webshot_0.5.2 stats4_4.0.3
## [91] munsell_0.5.0 viridisLite_0.3.0 sessioninfo_1.1.1